""" Core scraper functionality extracted from main.py with callback support for GUI integration. """ from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.common.keys import Keys from selenium.webdriver.chrome.options import Options import time import random import os import sys from pathlib import Path # Disable SSL verification warnings and errors import urllib3 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) class Scraper: """ EBoek.info web scraper with GUI callback support. This class handles the core scraping functionality while providing callback mechanisms for progress updates to a GUI application. """ def __init__(self, headless=False, progress_callback=None, scraping_mode=0): """ Initialize the scraper with optional GUI callback support. Args: headless (bool): Whether to run Chrome in headless mode progress_callback (callable): Optional callback function for progress updates Callback signature: callback(event_type: str, data: dict) scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics) """ self.progress_callback = progress_callback self._stop_requested = False self.scraping_mode = scraping_mode # Set up Chrome options with anti-detection measures chrome_options = Options() if headless: chrome_options.add_argument('--headless') # Fix SSL and certificate issues chrome_options.add_argument('--ignore-ssl-errors') chrome_options.add_argument('--ignore-certificate-errors') chrome_options.add_argument('--disable-web-security') chrome_options.add_argument('--allow-running-insecure-content') chrome_options.add_argument('--disable-extensions') # Fix DevTools connection issues chrome_options.add_argument('--remote-debugging-port=0') chrome_options.add_argument('--disable-dev-shm-usage') chrome_options.add_argument('--no-sandbox') # Make it look more human chrome_options.add_argument('--disable-blink-features=AutomationControlled') chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) chrome_options.add_experimental_option('useAutomationExtension', False) chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36') # Suppress logging chrome_options.add_experimental_option('excludeSwitches', ['enable-logging']) chrome_options.add_experimental_option('useAutomationExtension', False) chrome_options.add_argument('--disable-logging') chrome_options.add_argument('--log-level=3') # Set cross-platform download directory downloads_path = str(Path.home() / "Downloads") prefs = { "download.default_directory": downloads_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True } chrome_options.add_experimental_option("prefs", prefs) self.driver = webdriver.Chrome(options=chrome_options) self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})") self._emit_progress("scraper_initialized", {"headless": headless, "downloads_path": downloads_path}) def _emit_progress(self, event_type, data): """ Internal method to emit progress updates via callback. Args: event_type (str): Type of event (e.g., 'page_started', 'comic_completed') data (dict): Event data """ if self.progress_callback: try: self.progress_callback(event_type, data) except Exception as e: # Don't let callback errors crash the scraper pass def request_stop(self): """Request the scraper to stop gracefully at the next opportunity.""" self._stop_requested = True self._emit_progress("stop_requested", {}) def human_delay(self, min_sec=0.5, max_sec=2): """ Simulate human-like delay with cancellation support. Args: min_sec (float): Minimum delay time max_sec (float): Maximum delay time """ if self._stop_requested: return delay_time = random.uniform(min_sec, max_sec) self._emit_progress("delay_started", {"duration": delay_time}) time.sleep(delay_time) def human_type(self, element, text): """ Type text character by character with human-like delays. Args: element: Selenium web element to type into text (str): Text to type """ for char in text: if self._stop_requested: return element.send_keys(char) time.sleep(random.uniform(0.05, 0.15)) def navigate(self, url): """ Navigate to a URL with human-like delay. Args: url (str): URL to navigate to """ if self._stop_requested: return False self._emit_progress("navigation_started", {"url": url}) self.driver.get(url) self.human_delay(1, 3) self._emit_progress("navigation_completed", {"url": url}) return True def login(self, username, password): """ Login to EBoek.info with provided credentials. Args: username (str): Username for login password (str): Password for login Returns: bool: True if login successful, False otherwise """ if self._stop_requested: return False self._emit_progress("login_started", {"username": username}) try: self.driver.get("https://eboek.info/komerin") self.human_delay(2, 4) if self._stop_requested: return False # Find and fill username field username_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='text']") self.human_type(username_field, username) self.human_delay(0.5, 1) if self._stop_requested: return False # Find and fill password field password_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='password']") self.human_type(password_field, password) self.human_delay(0.5, 1.5) if self._stop_requested: return False # Submit the form submit_button = self.driver.find_element(By.CSS_SELECTOR, "input[type='submit']") submit_button.click() self.human_delay(2, 4) # Check if login was successful (basic check) # You could enhance this by checking for specific elements that appear after login current_url = self.driver.current_url login_successful = "komerin" not in current_url if login_successful: self._emit_progress("login_success", {"username": username}) else: self._emit_progress("login_failed", {"username": username, "error": "Login appears to have failed"}) return login_successful except Exception as e: self._emit_progress("login_failed", {"username": username, "error": str(e)}) return False def trigger_download(self, url): """ Open URL in new tab to trigger browser download. Args: url (str): URL of file to download Returns: bool: True if download triggered successfully """ if self._stop_requested: return False try: # Store current window handle current_window = self.driver.current_window_handle # Use JavaScript to open URL in new tab with same session self.driver.execute_script(f"window.open('{url}', '_blank');") # Wait for download to complete and tab to auto-close self.human_delay(3, 5) # Switch back to original window self.driver.switch_to.window(current_window) self._emit_progress("download_triggered", {"url": url}) return True except Exception as e: self._emit_progress("download_failed", {"url": url, "error": str(e)}) return False def scrape(self, start_page=1, end_page=1): """ Scrape comics from specified page range. Args: start_page (int): Starting page number end_page (int): Ending page number Returns: dict: Summary of scraping results """ if self._stop_requested: return {"success": False, "reason": "Cancelled before starting"} # Determine base URL and URL pattern based on scraping mode if self.scraping_mode == 1: # Latest Comics base_url = "https://eboek.info/laatste" mode_name = "Latest Comics" else: # All Comics (default) base_url = "https://eboek.info/stripverhalen-alle" mode_name = "All Comics" total_pages = end_page - start_page + 1 total_comics_processed = 0 total_downloads_triggered = 0 errors = [] self._emit_progress("scraping_started", { "start_page": start_page, "end_page": end_page, "total_pages": total_pages, "mode": mode_name }) for page_num in range(start_page, end_page + 1): if self._stop_requested: break # Construct page URL based on scraping mode if self.scraping_mode == 1: # Latest Comics page_url = f"{base_url}?_page={page_num}&ref=dw" else: # All Comics if page_num == 1: page_url = base_url else: page_url = f"{base_url}/page/{page_num}/" current_page_index = page_num - start_page + 1 self._emit_progress("page_started", { "page_number": page_num, "page_index": current_page_index, "total_pages": total_pages, "url": page_url }) # Navigate to the page if not self.navigate(page_url): continue # Scroll down a bit like a human would to see content self.driver.execute_script("window.scrollTo(0, 300)") self.human_delay(1, 2) if self._stop_requested: break try: # Find all comic strip links using mode-specific CSS selectors if self.scraping_mode == 1: # Latest Comics page # For "laatste" page - target only title links to avoid duplicates comic_links = self.driver.find_elements(By.CSS_SELECTOR, '.pt-cv-wrapper .pt-cv-ifield h5.pt-cv-title a') else: # All Comics page (default) # For "stripverhalen-alle" page - original selector comic_links = self.driver.find_elements(By.CSS_SELECTOR, 'h2.post-title a') comic_count = len(comic_links) self._emit_progress("page_comics_found", { "page_number": page_num, "comic_count": comic_count }) # Store URLs first to avoid stale element issues comic_urls = [link.get_attribute('href') for link in comic_links] # Take a break between pages (more likely and longer) if page_num > start_page: if random.random() < 0.7: # 70% chance of break break_time = random.uniform(15, 45) # 15-45 seconds self._emit_progress("page_break_started", { "duration": break_time, "page_number": page_num }) time.sleep(break_time) else: # Even if no long break, always pause a bit short_break = random.uniform(5, 10) self._emit_progress("short_break", { "duration": short_break, "page_number": page_num }) time.sleep(short_break) # Process all comics on this page for i, url in enumerate(comic_urls, 1): if self._stop_requested: break self._emit_progress("comic_started", { "page_number": page_num, "comic_index": i, "total_comics": comic_count, "url": url }) # Random chance to scroll on main page before clicking if random.random() < 0.4: scroll_amount = random.randint(100, 500) self.driver.execute_script(f"window.scrollBy(0, {scroll_amount})") self.human_delay(0.5, 1.5) # Open in new tab to keep main page self.driver.execute_script("window.open('');") self.driver.switch_to.window(self.driver.window_handles[-1]) try: self.driver.get(url) self.human_delay(2, 4) if self._stop_requested: break # Sometimes scroll down to see the content if random.random() < 0.6: self.driver.execute_script("window.scrollTo(0, 400)") self.human_delay(0.5, 1.5) # Extract title try: title = self.driver.find_element(By.CSS_SELECTOR, 'h1.entry-title').text except: title = f"Comic {i} on page {page_num}" self._emit_progress("comic_title_extracted", { "title": title, "url": url }) # Small delay before clicking download self.human_delay(0.8, 2) if self._stop_requested: break # Execute the downloadLinks() JavaScript function self.driver.execute_script("downloadLinks()") self.human_delay(1.5, 3) # Find all download links in the table download_links = self.driver.find_elements(By.CSS_SELECTOR, 'table a') download_count = len(download_links) self._emit_progress("download_links_found", { "title": title, "download_count": download_count }) # Trigger download for each file for j, link in enumerate(download_links): if self._stop_requested: break file_url = link.get_attribute('href') file_name = link.text.strip() self._emit_progress("download_started", { "file_name": file_name, "url": file_url, "index": j + 1, "total": download_count }) if self.trigger_download(file_url): total_downloads_triggered += 1 # Human-like delay between downloads if j < len(download_links) - 1: delay_time = random.uniform(2, 5) self._emit_progress("download_delay", { "duration": delay_time, "remaining": len(download_links) - j - 1 }) time.sleep(delay_time) total_comics_processed += 1 self._emit_progress("comic_completed", { "title": title, "downloads_triggered": download_count, "page_number": page_num, "comic_index": i }) # Take a longer break every 5 comics if i % 5 == 0 and i < len(comic_urls): break_time = random.uniform(3, 7) self._emit_progress("comic_batch_break", { "duration": break_time, "comics_processed": i }) time.sleep(break_time) except Exception as e: error_msg = f"Error processing {url}: {e}" errors.append(error_msg) self._emit_progress("comic_error", { "url": url, "error": str(e) }) # Human would pause after an error self.human_delay(2, 4) # Close tab and switch back try: self.driver.close() self.driver.switch_to.window(self.driver.window_handles[0]) except: # Handle case where tab might have closed itself if len(self.driver.window_handles) > 0: self.driver.switch_to.window(self.driver.window_handles[0]) # Vary the delay between comics self.human_delay(1, 3) self._emit_progress("page_completed", { "page_number": page_num, "comics_processed": len(comic_urls) }) except Exception as e: error_msg = f"Error processing page {page_num}: {e}" errors.append(error_msg) self._emit_progress("page_error", { "page_number": page_num, "error": str(e) }) # Generate summary summary = { "success": not self._stop_requested, "total_pages_processed": min(page_num - start_page + 1, total_pages) if 'page_num' in locals() else 0, "total_comics_processed": total_comics_processed, "total_downloads_triggered": total_downloads_triggered, "errors": errors, "cancelled": self._stop_requested } self._emit_progress("scraping_completed", summary) return summary def close(self): """Close the browser and clean up resources.""" try: self.driver.quit() self._emit_progress("scraper_closed", {}) except Exception as e: self._emit_progress("scraper_close_error", {"error": str(e)})