From 5f2fca226b68bbaba2744f45e829dead4df3bf3c Mon Sep 17 00:00:00 2001
From: Louis Mylle
Date: Wed, 10 Sep 2025 22:20:09 +0200
Subject: [PATCH] Enhance scraper functionality: update README for installation, add SSL handling in main.py, implement credential storage, and create start.bat for easy execution

---
Review notes (this section is ignored when the patch is applied):
- Credentials are stored in credentials.json next to main.py instead of
  rewriting main.py itself; the previous search string never matched the
  login call, so nothing was ever saved and the prompt came back every run.
- "excludeSwitches" is set once with both switches; a second call with the
  same key replaced the "enable-automation" entry.
- Context-line indentation was reconstructed; verify it against main.py
  (the download-loop hunk in particular) before applying.
- Blob ids on the "index" lines still refer to the original commit.

 README.md        |  2 +-
 main.py          | 72 ++++++++++++++++++++++++++++++++++++++++++++++++---
 requirements.txt |  3 ++-
 start.bat        |  3 +++
 4 files changed, 74 insertions(+), 6 deletions(-)
 create mode 100644 start.bat

diff --git a/README.md b/README.md
index 58f645f..2362efd 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ De ChromeDriver wordt automatisch geïnstalleerd bij het eerste gebruik. Zorg da
 Open Command Prompt in de projectmap en voer uit:
 
 ```
-pip install selenium requests
+python -m pip install selenium requests urllib3
 ```
 
 ### Stap 4: Login gegevens instellen
diff --git a/main.py b/main.py
index 4c4ad15..76ef6b0 100644
--- a/main.py
+++ b/main.py
@@ -4,6 +4,16 @@ from selenium.webdriver.common.keys import Keys
 from selenium.webdriver.chrome.options import Options
 import time
 import random
+import getpass
+import json
+import os
+
+# Silence urllib3's InsecureRequestWarning (this hides warnings only)
+import urllib3
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Saved login lives next to this script, outside the source code
+CREDENTIALS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'credentials.json')
 
 class Scraper:
     def __init__(self, headless=False):
@@ -11,12 +21,33 @@ class Scraper:
         if headless:
             chrome_options.add_argument('--headless')
 
+        # Fix SSL and certificate issues
+        # NOTE: these flags relax TLS certificate validation and the browser's
+        # web-security checks for the whole session, including the login
+        # request; remove them once the certificate problem is resolved.
+        chrome_options.add_argument('--ignore-ssl-errors')
+        chrome_options.add_argument('--ignore-certificate-errors')
+        chrome_options.add_argument('--disable-web-security')
+        chrome_options.add_argument('--allow-running-insecure-content')
+        chrome_options.add_argument('--disable-extensions')
+
+        # Fix DevTools connection issues
+        chrome_options.add_argument('--remote-debugging-port=0')
+        chrome_options.add_argument('--disable-dev-shm-usage')
+        chrome_options.add_argument('--no-sandbox')
+
         # Make it look more human
         chrome_options.add_argument('--disable-blink-features=AutomationControlled')
-        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
+        # A second add_experimental_option call with the same key replaces the
+        # first, so both switches have to be excluded in a single list.
+        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation", "enable-logging"])
         chrome_options.add_experimental_option('useAutomationExtension', False)
         chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
 
+        # Suppress logging
+        chrome_options.add_argument('--disable-logging')
+        chrome_options.add_argument('--log-level=3')
+
         self.driver = webdriver.Chrome(options=chrome_options)
         self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
 
@@ -157,9 +188,12 @@ class Scraper:
                 print(f"Triggering download: {file_name}")
                 self.trigger_download(file_url)
 
-                # Small random delay between downloads
+                # Human-like delay between downloads
                 if j < len(download_links) - 1:
-                    self.human_delay(0.5, 1.5)
+                    # Longer delay for multiple downloads (human would wait and check)
+                    delay_time = random.uniform(2, 5)
+                    print(f"Waiting {delay_time:.1f} seconds before next download...")
+                    time.sleep(delay_time)
 
             # Take a longer break every 5 comics
             if i % 5 == 0 and i < len(comic_urls):
@@ -182,11 +216,42 @@ class Scraper:
     def close(self):
         self.driver.quit()
 
+def load_credentials():
+    """Return the saved (username, password), or None when nothing usable is stored."""
+    try:
+        with open(CREDENTIALS_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+        return data["username"], data["password"]
+    except (OSError, ValueError, KeyError, TypeError):
+        # Missing, unreadable or malformed file: treat as "not configured yet"
+        return None
+
+def update_credentials_in_file(username, password):
+    """Save the credentials to CREDENTIALS_FILE for future runs.
+
+    The password is stored in plain text; keep the file out of version control.
+    """
+    with open(CREDENTIALS_FILE, 'w', encoding='utf-8') as f:
+        json.dump({"username": username, "password": password}, f)
+
+    print("Credentials saved for future use!")
+
 if __name__ == "__main__":
+    # Reuse saved credentials; only prompt on the first run
+    credentials = load_credentials()
+    if credentials is None:
+        print("First time setup: Please enter your EBoek.info credentials")
+        username = input("Username: ")
+        # getpass keeps the password from being echoed to the console
+        password = getpass.getpass("Password: ")
+        update_credentials_in_file(username, password)
+    else:
+        username, password = credentials
+
     scraper = Scraper()
 
     # Login first
-    scraper.login("jouw_gebruikersnaam", "jouw_wachtwoord")
+    scraper.login(username, password)
 
     # Ask which page(s) to scrape
     start = int(input("Enter start page number (1 for first page): "))
diff --git a/requirements.txt b/requirements.txt
index 954f0db..3301784 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-selenium
\ No newline at end of file
+selenium
+urllib3
\ No newline at end of file
diff --git a/start.bat b/start.bat
new file mode 100644
index 0000000..798c33b
--- /dev/null
+++ b/start.bat
@@ -0,0 +1,3 @@
+@echo off
+cd /d "%~dp0"
+start cmd /k "python main.py"
\ No newline at end of file