# Selenium-based scraper for eboek.info comic strips: logs in, walks the
# listing pages, and triggers browser downloads with human-like pacing.
# Standard library
import os
import random
import time

# Third-party
import urllib3
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Disable SSL verification warnings and errors
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

class Scraper:
    """Selenium Chrome scraper for eboek.info with human-like pacing.

    The browser is configured to tolerate SSL/certificate problems on the
    target site and to hide the most common automation fingerprints
    (``navigator.webdriver``, the enable-automation switch, the default
    automation extension).
    """

    def __init__(self, headless=False):
        """Start a configured Chrome session.

        Args:
            headless: run Chrome without a visible window when True.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless')

        # Fix SSL and certificate issues
        chrome_options.add_argument('--ignore-ssl-errors')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--disable-web-security')
        chrome_options.add_argument('--allow-running-insecure-content')
        chrome_options.add_argument('--disable-extensions')

        # Fix DevTools connection issues
        chrome_options.add_argument('--remote-debugging-port=0')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')

        # Make it look more human.
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        # add_experimental_option() overwrites on repeated keys, so both
        # switches must be excluded in a single call.  (A second call with
        # 'excludeSwitches' would silently drop 'enable-automation'.)
        chrome_options.add_experimental_option(
            'excludeSwitches', ['enable-automation', 'enable-logging'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

        # Suppress Chrome's own logging.
        chrome_options.add_argument('--disable-logging')
        chrome_options.add_argument('--log-level=3')

        self.driver = webdriver.Chrome(options=chrome_options)
        # Hide navigator.webdriver from scripts on the current page.
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    def human_delay(self, min_sec=0.5, max_sec=2):
        """Sleep for a random duration between min_sec and max_sec seconds."""
        time.sleep(random.uniform(min_sec, max_sec))

    def human_type(self, element, text):
        """Type *text* into *element* one character at a time with jitter."""
        for char in text:
            element.send_keys(char)
            time.sleep(random.uniform(0.05, 0.15))

    def navigate(self, url):
        """Load *url* and pause briefly like a human reader would."""
        self.driver.get(url)
        self.human_delay(1, 3)

    def login(self, username, password):
        """Log in to eboek.info using the site's plain login form."""
        self.driver.get("https://eboek.info/komerin")
        self.human_delay(2, 4)

        # Direct selectors based on what worked
        username_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='text']")
        self.human_type(username_field, username)

        self.human_delay(0.5, 1)

        password_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='password']")
        self.human_type(password_field, password)

        self.human_delay(0.5, 1.5)

        submit_button = self.driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
        submit_button.click()

        self.human_delay(2, 4)

    def trigger_download(self, url):
        """Open URL in new tab to trigger browser download"""
        # Store current window handle
        current_window = self.driver.current_window_handle

        # Open the URL in a new tab with the same session.  The URL is passed
        # as a script argument (arguments[0]) so quotes or other special
        # characters in it cannot break out of the JavaScript string, which
        # the previous f-string interpolation allowed.
        self.driver.execute_script("window.open(arguments[0], '_blank');", url)

        # Wait for download to complete and tab to auto-close
        self.human_delay(3, 5)

        # Switch back to original window
        self.driver.switch_to.window(current_window)

        print(f"Download triggered for: {url}")

    def scrape(self, start_page=1, end_page=1):
        """Scrape comics from specified page range"""
        base_url = "https://eboek.info/stripverhalen-alle"

        for page_num in range(start_page, end_page + 1):
            # Construct page URL (page 1 is the bare listing URL).
            if page_num == 1:
                page_url = base_url
            else:
                page_url = f"{base_url}/page/{page_num}/"

            print(f"\n{'='*50}")
            print(f"Processing page {page_num}: {page_url}")
            print(f"{'='*50}")

            # Navigate to the page
            self.navigate(page_url)

            # Scroll down a bit like a human would to see content
            self.driver.execute_script("window.scrollTo(0, 300)")
            self.human_delay(1, 2)

            # Find all comic strip links
            comic_links = self.driver.find_elements(By.CSS_SELECTOR, 'h2.post-title a')

            print(f"Found {len(comic_links)} comic strips on page {page_num}")

            # Store URLs first to avoid stale element issues
            comic_urls = [link.get_attribute('href') for link in comic_links]

            # Take a break between pages (more likely and longer)
            if page_num > start_page:
                if random.random() < 0.7:  # 70% chance of break
                    break_time = random.uniform(15, 45)  # 15-45 seconds
                    print(f"\nTaking a break between pages for {break_time:.1f} seconds...")
                    time.sleep(break_time)
                else:
                    # Even if no long break, always pause a bit
                    short_break = random.uniform(5, 10)
                    print(f"\nQuick pause for {short_break:.1f} seconds...")
                    time.sleep(short_break)

            # Process all comics on this page
            for i, url in enumerate(comic_urls, 1):
                print(f"\nProcessing comic {i}/{len(comic_urls)} on page {page_num}: {url}")

                # Random chance to scroll on main page before clicking
                if random.random() < 0.4:
                    scroll_amount = random.randint(100, 500)
                    self.driver.execute_script(f"window.scrollBy(0, {scroll_amount})")
                    self.human_delay(0.5, 1.5)

                # Open in new tab to keep main page
                self.driver.execute_script("window.open('');")
                self.driver.switch_to.window(self.driver.window_handles[-1])

                try:
                    self.driver.get(url)
                    self.human_delay(2, 4)

                    # Sometimes scroll down to see the content
                    if random.random() < 0.6:
                        self.driver.execute_script("window.scrollTo(0, 400)")
                        self.human_delay(0.5, 1.5)

                    # Extract title
                    title = self.driver.find_element(By.CSS_SELECTOR, 'h1.entry-title').text
                    print(f"Title: {title}")

                    # Small delay before clicking download
                    self.human_delay(0.8, 2)

                    # Execute the site's downloadLinks() JavaScript function
                    self.driver.execute_script("downloadLinks()")
                    self.human_delay(1.5, 3)

                    # Find all download links in the table
                    download_links = self.driver.find_elements(By.CSS_SELECTOR, 'table a')

                    print(f"Found {len(download_links)} download links")

                    # Trigger download for each file
                    for j, link in enumerate(download_links):
                        file_url = link.get_attribute('href')
                        file_name = link.text.strip()

                        print(f"Triggering download: {file_name}")
                        self.trigger_download(file_url)

                        # Human-like delay between downloads (a human would
                        # wait and check) — skipped after the last file.
                        if j < len(download_links) - 1:
                            delay_time = random.uniform(2, 5)
                            print(f"Waiting {delay_time:.1f} seconds before next download...")
                            time.sleep(delay_time)

                    # Take a longer break every 5 comics
                    if i % 5 == 0 and i < len(comic_urls):
                        break_time = random.uniform(3, 7)
                        print(f"\nTaking a break for {break_time:.1f} seconds...")
                        time.sleep(break_time)

                except Exception as e:
                    print(f"Error processing {url}: {e}")
                    # Human would pause after an error
                    self.human_delay(2, 4)
                finally:
                    # Always close the comic tab and return to the listing,
                    # even if processing raised something unexpected.
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[0])

                # Vary the delay between comics
                self.human_delay(1, 3)

    def close(self):
        """Shut down the browser and end the WebDriver session."""
        self.driver.quit()

def update_credentials_in_file(username, password):
    """Persist credentials by rewriting this script's placeholder assignments.

    The __main__ block below initialises ``username``/``password`` to Dutch
    placeholder values and prompts for real ones when it sees them; this
    function replaces those placeholder assignments in the source file so
    future runs skip the prompt.  (The original implementation searched for
    ``scraper.login("xxx", "yyy")``, a string that does not occur anywhere
    in this file, so it never saved anything.)

    WARNING: this stores the credentials in plain text inside the script.
    Quotation marks in *username*/*password* are not escaped.
    """
    # Read the current file
    with open(__file__, 'r', encoding='utf-8') as f:
        content = f.read()

    # Build the needles at runtime (concatenation) so these replace() calls
    # can never match their own source text when the file is rewritten.
    user_placeholder = 'username = ' + '"jouw_gebruikersnaam"'
    pass_placeholder = 'password = ' + '"jouw_wachtwoord"'
    content = content.replace(user_placeholder, f'username = "{username}"')
    content = content.replace(pass_placeholder, f'password = "{password}"')

    # Write back to file
    with open(__file__, 'w', encoding='utf-8') as f:
        f.write(content)

    print("Credentials saved for future use!")

if __name__ == "__main__":
    # Placeholder credentials; update_credentials_in_file() rewrites these
    # assignments on first run so later runs skip the setup prompt.
    username = "jouw_gebruikersnaam"
    password = "jouw_wachtwoord"

    if username == "jouw_gebruikersnaam" or password == "jouw_wachtwoord":
        print("First time setup: Please enter your EBoek.info credentials")
        new_username = input("Username: ")
        new_password = input("Password: ")

        # Update the file with new credentials
        update_credentials_in_file(new_username, new_password)

        # Use the new credentials
        username = new_username
        password = new_password

    scraper = Scraper()
    try:
        # Login first
        scraper.login(username, password)

        # Ask which page(s) to scrape; reprompt on non-numeric input
        # instead of crashing with an unhandled ValueError.
        while True:
            try:
                start = int(input("Enter start page number (1 for first page): "))
                end = int(input("Enter end page number (same as start for single page): "))
                break
            except ValueError:
                print("Please enter whole numbers.")

        # Scrape the specified pages
        scraper.scrape(start_page=start, end_page=end)

        # Keep browser open until the user confirms.
        input("\nDone! Press Enter to close the browser...")
    finally:
        # Ensure the Chrome session is torn down even if login or
        # scraping raises.
        scraper.close()