"""Scraper for eboek.info comic strips.

Drives a visible (or headless) Chrome session that mimics human browsing
behaviour (randomized delays, scrolling, per-character typing) to log in,
walk the listing pages, and trigger file downloads through the browser's
own session.
"""

import json
import os
import random
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Disable SSL verification warnings and errors
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Scraper:
    """Human-looking Chrome automation for eboek.info."""

    def __init__(self, headless=False):
        """Start Chrome with anti-detection and SSL-tolerant options.

        Args:
            headless: run Chrome without a visible window when True.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless')
        # Fix SSL and certificate issues
        chrome_options.add_argument('--ignore-ssl-errors')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--disable-web-security')
        chrome_options.add_argument('--allow-running-insecure-content')
        chrome_options.add_argument('--disable-extensions')
        # Fix DevTools connection issues
        chrome_options.add_argument('--remote-debugging-port=0')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')
        # Make it look more human
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        # BUG FIX: experimental options are stored in a dict keyed by option
        # name, so the original's second 'excludeSwitches' call (for
        # 'enable-logging') silently overwrote the first one (for
        # 'enable-automation'), re-enabling the automation banner. Register
        # both switches in a single call instead. The duplicated
        # 'useAutomationExtension' call is also collapsed to one.
        chrome_options.add_experimental_option(
            'excludeSwitches', ['enable-automation', 'enable-logging'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument(
            'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/120.0.0.0 Safari/537.36')
        # Suppress logging
        chrome_options.add_argument('--disable-logging')
        chrome_options.add_argument('--log-level=3')
        self.driver = webdriver.Chrome(options=chrome_options)
        # Hide the navigator.webdriver flag that betrays automation.
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', "
            "{get: () => undefined})")

    def human_delay(self, min_sec=0.5, max_sec=2):
        """Sleep a random interval to imitate human pacing."""
        time.sleep(random.uniform(min_sec, max_sec))

    def human_type(self, element, text):
        """Type *text* into *element* one character at a time."""
        for char in text:
            element.send_keys(char)
            time.sleep(random.uniform(0.05, 0.15))

    def navigate(self, url):
        """Load *url* and pause briefly like a human reader would."""
        self.driver.get(url)
        self.human_delay(1, 3)

    def login(self, username, password):
        """Log in to eboek.info with the given credentials."""
        self.driver.get("https://eboek.info/komerin")
        self.human_delay(2, 4)
        # Direct selectors based on what worked
        username_field = self.driver.find_element(
            By.CSS_SELECTOR, "input[type='text']")
        self.human_type(username_field, username)
        self.human_delay(0.5, 1)
        password_field = self.driver.find_element(
            By.CSS_SELECTOR, "input[type='password']")
        self.human_type(password_field, password)
        self.human_delay(0.5, 1.5)
        submit_button = self.driver.find_element(
            By.CSS_SELECTOR, "input[type='submit']")
        submit_button.click()
        self.human_delay(2, 4)

    def trigger_download(self, url):
        """Open URL in new tab to trigger browser download"""
        # Store current window handle
        current_window = self.driver.current_window_handle
        # Use JavaScript to open URL in new tab with same session.
        # ROBUSTNESS FIX: json.dumps() produces a properly escaped JS string
        # literal, so a URL containing quotes cannot break out of the script
        # (the original interpolated the raw URL into single quotes).
        self.driver.execute_script(
            f"window.open({json.dumps(url)}, '_blank');")
        # Wait for download to complete and tab to auto-close
        self.human_delay(3, 5)
        # Switch back to original window
        self.driver.switch_to.window(current_window)
        print(f"Download triggered for: {url}")

    def _process_comic(self, url):
        """Visit one comic page in the current tab and trigger its downloads.

        Assumes the page exposes a downloadLinks() JS function that fills a
        table with file links — TODO confirm against the live site.
        """
        self.driver.get(url)
        self.human_delay(2, 4)
        # Sometimes scroll down to see the content
        if random.random() < 0.6:
            self.driver.execute_script("window.scrollTo(0, 400)")
            self.human_delay(0.5, 1.5)
        # Extract title
        title = self.driver.find_element(
            By.CSS_SELECTOR, 'h1.entry-title').text
        print(f"Title: {title}")
        # Small delay before clicking download
        self.human_delay(0.8, 2)
        # Execute the downloadLinks() JavaScript function
        self.driver.execute_script("downloadLinks()")
        self.human_delay(1.5, 3)
        # Find all download links in the table
        download_links = self.driver.find_elements(By.CSS_SELECTOR, 'table a')
        print(f"Found {len(download_links)} download links")
        # Trigger download for each file
        for j, link in enumerate(download_links):
            file_url = link.get_attribute('href')
            file_name = link.text.strip()
            print(f"Triggering download: {file_name}")
            self.trigger_download(file_url)
            # Human-like delay between downloads
            if j < len(download_links) - 1:
                # Longer delay for multiple downloads (human would wait
                # and check)
                delay_time = random.uniform(2, 5)
                print(f"Waiting {delay_time:.1f} seconds before next download...")
                time.sleep(delay_time)

    def scrape(self, start_page=1, end_page=1):
        """Scrape comics from specified page range"""
        base_url = "https://eboek.info/stripverhalen-alle"

        for page_num in range(start_page, end_page + 1):
            # Construct page URL
            if page_num == 1:
                page_url = base_url
            else:
                page_url = f"{base_url}/page/{page_num}/"

            print(f"\n{'='*50}")
            print(f"Processing page {page_num}: {page_url}")
            print(f"{'='*50}")

            # Navigate to the page
            self.navigate(page_url)

            # Scroll down a bit like a human would to see content
            self.driver.execute_script("window.scrollTo(0, 300)")
            self.human_delay(1, 2)

            # Find all comic strip links
            comic_links = self.driver.find_elements(
                By.CSS_SELECTOR, 'h2.post-title a')
            print(f"Found {len(comic_links)} comic strips on page {page_num}")

            # Store URLs first to avoid stale element issues
            comic_urls = [link.get_attribute('href') for link in comic_links]

            # Take a break between pages (more likely and longer)
            if page_num > start_page:
                if random.random() < 0.7:  # 70% chance of break
                    break_time = random.uniform(15, 45)  # 15-45 seconds
                    print(f"\nTaking a break between pages for "
                          f"{break_time:.1f} seconds...")
                    time.sleep(break_time)
                else:
                    # Even if no long break, always pause a bit
                    short_break = random.uniform(5, 10)
                    print(f"\nQuick pause for {short_break:.1f} seconds...")
                    time.sleep(short_break)

            # Process all comics on this page
            for i, url in enumerate(comic_urls, 1):
                print(f"\nProcessing comic {i}/{len(comic_urls)} "
                      f"on page {page_num}: {url}")

                # Random chance to scroll on main page before clicking
                if random.random() < 0.4:
                    scroll_amount = random.randint(100, 500)
                    self.driver.execute_script(
                        f"window.scrollBy(0, {scroll_amount})")
                    self.human_delay(0.5, 1.5)

                # Open in new tab to keep main page
                self.driver.execute_script("window.open('');")
                self.driver.switch_to.window(self.driver.window_handles[-1])

                try:
                    self._process_comic(url)

                    # Take a longer break every 5 comics
                    if i % 5 == 0 and i < len(comic_urls):
                        break_time = random.uniform(3, 7)
                        print(f"\nTaking a break for {break_time:.1f} seconds...")
                        time.sleep(break_time)
                except Exception as e:
                    # Best-effort per-comic handling: log and continue with
                    # the next comic rather than aborting the whole page.
                    print(f"Error processing {url}: {e}")
                    # Human would pause after an error
                    self.human_delay(2, 4)

                # Close tab and switch back
                self.driver.close()
                self.driver.switch_to.window(self.driver.window_handles[0])

                # Vary the delay between comics
                self.human_delay(1, 3)

    def close(self):
        """Shut the browser down."""
        self.driver.quit()


def update_credentials_in_file(username, password):
    """Update the credentials in this file for future use.

    SECURITY NOTE: this stores the password in plain text inside the script
    itself — acceptable only on a private, single-user machine.
    NOTE(review): credentials containing a double quote or backslash would
    produce invalid replacement source — confirm inputs are plain text.
    """
    # Read the current file
    with open(__file__, 'r', encoding='utf-8') as f:
        content = f.read()

    # BUG FIX: the original replaced 'scraper.login("xxx", "yyy")', a string
    # that appears nowhere in this file, so the rewrite was a no-op and the
    # user was re-prompted on every run. Target the actual placeholder
    # assignments in the __main__ block instead. The patterns are built by
    # concatenation so this function's own source never contains the full
    # search string and cannot be corrupted by its own replacement.
    user_placeholder = 'username = "' + 'jouw_gebruikersnaam' + '"'
    pass_placeholder = 'password = "' + 'jouw_wachtwoord' + '"'
    content = content.replace(user_placeholder, f'username = "{username}"')
    content = content.replace(pass_placeholder, f'password = "{password}"')

    # Write back to file
    with open(__file__, 'w', encoding='utf-8') as f:
        f.write(content)

    print("Credentials saved for future use!")


if __name__ == "__main__":
    # Check if credentials need to be set
    username = "jouw_gebruikersnaam"
    password = "jouw_wachtwoord"

    if username == "jouw_gebruikersnaam" or password == "jouw_wachtwoord":
        print("First time setup: Please enter your EBoek.info credentials")
        new_username = input("Username: ")
        new_password = input("Password: ")

        # Update the file with new credentials
        update_credentials_in_file(new_username, new_password)

        # Use the new credentials
        username = new_username
        password = new_password

    scraper = Scraper()
    try:
        # Login first
        scraper.login(username, password)

        # Ask which page(s) to scrape
        start = int(input("Enter start page number (1 for first page): "))
        end = int(input("Enter end page number (same as start for single page): "))

        # Scrape the specified pages
        scraper.scrape(start_page=start, end_page=end)

        # Keep browser open
        input("\nDone! Press Enter to close the browser...")
    finally:
        # BUG FIX: the original never reached close() if login/scrape raised,
        # leaking the Chrome process. try/finally guarantees teardown.
        scraper.close()