Enhance scraper functionality: update README for installation, add SSL handling in main.py, implement credential storage, and create start.bat for easy execution
This commit is contained in:
@@ -28,7 +28,7 @@ De ChromeDriver wordt automatisch geïnstalleerd bij het eerste gebruik. Zorg da
|
||||
|
||||
Open Command Prompt in de projectmap en voer uit:
|
||||
```
|
||||
pip install selenium requests
|
||||
python -m pip install selenium requests urllib3
|
||||
```
|
||||
|
||||
### Stap 4: Login gegevens instellen
|
||||
|
||||
64
main.py
64
main.py
@@ -4,6 +4,11 @@ from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
import time
|
||||
import random
|
||||
import os
|
||||
|
||||
# Disable SSL verification warnings and errors
|
||||
import urllib3
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
class Scraper:
|
||||
def __init__(self, headless=False):
|
||||
@@ -11,12 +16,30 @@ class Scraper:
|
||||
if headless:
|
||||
chrome_options.add_argument('--headless')
|
||||
|
||||
# Fix SSL and certificate issues
|
||||
chrome_options.add_argument('--ignore-ssl-errors')
|
||||
chrome_options.add_argument('--ignore-certificate-errors')
|
||||
chrome_options.add_argument('--disable-web-security')
|
||||
chrome_options.add_argument('--allow-running-insecure-content')
|
||||
chrome_options.add_argument('--disable-extensions')
|
||||
|
||||
# Fix DevTools connection issues
|
||||
chrome_options.add_argument('--remote-debugging-port=0')
|
||||
chrome_options.add_argument('--disable-dev-shm-usage')
|
||||
chrome_options.add_argument('--no-sandbox')
|
||||
|
||||
# Make it look more human
|
||||
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
|
||||
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
|
||||
|
||||
# Suppress logging
|
||||
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
|
||||
chrome_options.add_experimental_option('useAutomationExtension', False)
|
||||
chrome_options.add_argument('--disable-logging')
|
||||
chrome_options.add_argument('--log-level=3')
|
||||
|
||||
self.driver = webdriver.Chrome(options=chrome_options)
|
||||
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
|
||||
|
||||
@@ -157,9 +180,12 @@ class Scraper:
|
||||
print(f"Triggering download: {file_name}")
|
||||
self.trigger_download(file_url)
|
||||
|
||||
# Small random delay between downloads
|
||||
# Human-like delay between downloads
|
||||
if j < len(download_links) - 1:
|
||||
self.human_delay(0.5, 1.5)
|
||||
# Longer delay for multiple downloads (human would wait and check)
|
||||
delay_time = random.uniform(2, 5)
|
||||
print(f"Waiting {delay_time:.1f} seconds before next download...")
|
||||
time.sleep(delay_time)
|
||||
|
||||
# Take a longer break every 5 comics
|
||||
if i % 5 == 0 and i < len(comic_urls):
|
||||
@@ -182,11 +208,43 @@ class Scraper:
|
||||
def close(self):
    """Shut down the Selenium-controlled browser and release its resources.

    Call this once scraping is finished; the driver (and the Chrome
    process it owns) is not usable afterwards.
    """
    self.driver.quit()
|
||||
|
||||
def update_credentials_in_file(username, password):
    """Persist the entered credentials into this script for future runs.

    Rewrites the placeholder assignments in the ``__main__`` section
    (``username = "jouw_gebruikersnaam"`` / ``password = "jouw_wachtwoord"``)
    so the first-time-setup prompt is skipped on subsequent runs.

    NOTE(review): this stores the password in PLAINTEXT inside the source
    file — consider an external config file or the system keyring instead.

    Args:
        username: account name entered by the user.
        password: password entered by the user.
    """
    # Read the current file.
    with open(__file__, 'r', encoding='utf-8') as f:
        content = f.read()

    # BUGFIX: the original searched for 'scraper.login("xxx", "yyy")',
    # a string that does not occur in this file, so the replace was a
    # no-op and credentials were never saved.  Replace the actual
    # placeholder assignments used by the first-time-setup check instead.
    content = content.replace('username = "jouw_gebruikersnaam"',
                              f'username = "{username}"')
    content = content.replace('password = "jouw_wachtwoord"',
                              f'password = "{password}"')

    # Write the updated source back to disk.
    with open(__file__, 'w', encoding='utf-8') as f:
        f.write(content)

    print("Credentials saved for future use!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Check if credentials need to be set
|
||||
username = "jouw_gebruikersnaam"
|
||||
password = "jouw_wachtwoord"
|
||||
|
||||
if username == "jouw_gebruikersnaam" or password == "jouw_wachtwoord":
|
||||
print("First time setup: Please enter your EBoek.info credentials")
|
||||
new_username = input("Username: ")
|
||||
new_password = input("Password: ")
|
||||
|
||||
# Update the file with new credentials
|
||||
update_credentials_in_file(new_username, new_password)
|
||||
|
||||
# Use the new credentials
|
||||
username = new_username
|
||||
password = new_password
|
||||
|
||||
scraper = Scraper()
|
||||
|
||||
# Login first
|
||||
scraper.login("jouw_gebruikersnaam", "jouw_wachtwoord")
|
||||
scraper.login(username, password)
|
||||
|
||||
# Ask which page(s) to scrape
|
||||
start = int(input("Enter start page number (1 for first page): "))
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
selenium
|
||||
selenium
|
||||
urllib3
|
||||
Reference in New Issue
Block a user