# Selenium-based scraper for eboek.info comic strips: logs in, walks the
# listing pages, and triggers browser downloads with human-like pacing.
# Standard library
import os
import random
import time

# Third-party
import urllib3
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Disable SSL verification warnings and errors
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

class Scraper:
    """Selenium Chrome scraper for eboek.info with human-like pacing.

    The browser is configured to tolerate SSL/certificate problems on the
    target site and to hide the most common automation fingerprints
    (``navigator.webdriver``, the enable-automation switch, the default
    automation extension).
    """

    def __init__(self, headless=False):
        """Start a configured Chrome session.

        Args:
            headless: run Chrome without a visible window when True.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless')

        # Fix SSL and certificate issues
        chrome_options.add_argument('--ignore-ssl-errors')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--disable-web-security')
        chrome_options.add_argument('--allow-running-insecure-content')
        chrome_options.add_argument('--disable-extensions')

        # Fix DevTools connection issues
        chrome_options.add_argument('--remote-debugging-port=0')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')

        # Make it look more human.
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        # add_experimental_option() overwrites on repeated keys, so both
        # switches must be excluded in a single call.  (A second call with
        # 'excludeSwitches' would silently drop 'enable-automation'.)
        chrome_options.add_experimental_option(
            'excludeSwitches', ['enable-automation', 'enable-logging'])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

        # Suppress Chrome's own logging.
        chrome_options.add_argument('--disable-logging')
        chrome_options.add_argument('--log-level=3')

        self.driver = webdriver.Chrome(options=chrome_options)
        # Hide navigator.webdriver from scripts on the current page.
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    def human_delay(self, min_sec=0.5, max_sec=2):
        """Sleep for a random duration between min_sec and max_sec seconds."""
        time.sleep(random.uniform(min_sec, max_sec))

    def human_type(self, element, text):
        """Type *text* into *element* one character at a time with jitter."""
        for char in text:
            element.send_keys(char)
            time.sleep(random.uniform(0.05, 0.15))

    def navigate(self, url):
        """Load *url* and pause briefly like a human reader would."""
        self.driver.get(url)
        self.human_delay(1, 3)

    def login(self, username, password):
        """Log in to eboek.info using the site's plain login form."""
        self.driver.get("https://eboek.info/komerin")
        self.human_delay(2, 4)

        # Direct selectors based on what worked
        username_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='text']")
        self.human_type(username_field, username)

        self.human_delay(0.5, 1)

        password_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='password']")
        self.human_type(password_field, password)

        self.human_delay(0.5, 1.5)

        submit_button = self.driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
        submit_button.click()

        self.human_delay(2, 4)

    def trigger_download(self, url):
        """Open URL in new tab to trigger browser download"""
        # Store current window handle
        current_window = self.driver.current_window_handle

        # Open the URL in a new tab with the same session.  The URL is passed
        # as a script argument (arguments[0]) so quotes or other special
        # characters in it cannot break out of the JavaScript string, which
        # the previous f-string interpolation allowed.
        self.driver.execute_script("window.open(arguments[0], '_blank');", url)

        # Wait for download to complete and tab to auto-close
        self.human_delay(3, 5)

        # Switch back to original window
        self.driver.switch_to.window(current_window)

        print(f"Download triggered for: {url}")

    def scrape(self, start_page=1, end_page=1):
        """Scrape comics from specified page range"""
        base_url = "https://eboek.info/stripverhalen-alle"

        for page_num in range(start_page, end_page + 1):
            # Construct page URL (page 1 is the bare listing URL).
            if page_num == 1:
                page_url = base_url
            else:
                page_url = f"{base_url}/page/{page_num}/"

            print(f"\n{'='*50}")
            print(f"Processing page {page_num}: {page_url}")
            print(f"{'='*50}")

            # Navigate to the page
            self.navigate(page_url)

            # Scroll down a bit like a human would to see content
            self.driver.execute_script("window.scrollTo(0, 300)")
            self.human_delay(1, 2)

            # Find all comic strip links
            comic_links = self.driver.find_elements(By.CSS_SELECTOR, 'h2.post-title a')

            print(f"Found {len(comic_links)} comic strips on page {page_num}")

            # Store URLs first to avoid stale element issues
            comic_urls = [link.get_attribute('href') for link in comic_links]

            # Take a break between pages (more likely and longer)
            if page_num > start_page:
                if random.random() < 0.7:  # 70% chance of break
                    break_time = random.uniform(15, 45)  # 15-45 seconds
                    print(f"\nTaking a break between pages for {break_time:.1f} seconds...")
                    time.sleep(break_time)
                else:
                    # Even if no long break, always pause a bit
                    short_break = random.uniform(5, 10)
                    print(f"\nQuick pause for {short_break:.1f} seconds...")
                    time.sleep(short_break)

            # Process all comics on this page
            for i, url in enumerate(comic_urls, 1):
                print(f"\nProcessing comic {i}/{len(comic_urls)} on page {page_num}: {url}")

                # Random chance to scroll on main page before clicking
                if random.random() < 0.4:
                    scroll_amount = random.randint(100, 500)
                    self.driver.execute_script(f"window.scrollBy(0, {scroll_amount})")
                    self.human_delay(0.5, 1.5)

                # Open in new tab to keep main page
                self.driver.execute_script("window.open('');")
                self.driver.switch_to.window(self.driver.window_handles[-1])

                try:
                    self.driver.get(url)
                    self.human_delay(2, 4)

                    # Sometimes scroll down to see the content
                    if random.random() < 0.6:
                        self.driver.execute_script("window.scrollTo(0, 400)")
                        self.human_delay(0.5, 1.5)

                    # Extract title
                    title = self.driver.find_element(By.CSS_SELECTOR, 'h1.entry-title').text
                    print(f"Title: {title}")

                    # Small delay before clicking download
                    self.human_delay(0.8, 2)

                    # Execute the site's downloadLinks() JavaScript function
                    self.driver.execute_script("downloadLinks()")
                    self.human_delay(1.5, 3)

                    # Find all download links in the table
                    download_links = self.driver.find_elements(By.CSS_SELECTOR, 'table a')

                    print(f"Found {len(download_links)} download links")

                    # Trigger download for each file
                    for j, link in enumerate(download_links):
                        file_url = link.get_attribute('href')
                        file_name = link.text.strip()

                        print(f"Triggering download: {file_name}")
                        self.trigger_download(file_url)

                        # Human-like delay between downloads (a human would
                        # wait and check) — skipped after the last file.
                        if j < len(download_links) - 1:
                            delay_time = random.uniform(2, 5)
                            print(f"Waiting {delay_time:.1f} seconds before next download...")
                            time.sleep(delay_time)

                    # Take a longer break every 5 comics
                    if i % 5 == 0 and i < len(comic_urls):
                        break_time = random.uniform(3, 7)
                        print(f"\nTaking a break for {break_time:.1f} seconds...")
                        time.sleep(break_time)

                except Exception as e:
                    print(f"Error processing {url}: {e}")
                    # Human would pause after an error
                    self.human_delay(2, 4)
                finally:
                    # Always close the comic tab and return to the listing,
                    # even if processing raised something unexpected.
                    self.driver.close()
                    self.driver.switch_to.window(self.driver.window_handles[0])

                # Vary the delay between comics
                self.human_delay(1, 3)

    def close(self):
        """Shut down the browser and end the WebDriver session."""
        self.driver.quit()

def update_credentials_in_file(username, password):
    """Persist credentials by rewriting this script's placeholder assignments.

    The __main__ block below initialises ``username``/``password`` to Dutch
    placeholder values and prompts for real ones when it sees them; this
    function replaces those placeholder assignments in the source file so
    future runs skip the prompt.  (The original implementation searched for
    ``scraper.login("xxx", "yyy")``, a string that does not occur anywhere
    in this file, so it never saved anything.)

    WARNING: this stores the credentials in plain text inside the script.
    Quotation marks in *username*/*password* are not escaped.
    """
    # Read the current file
    with open(__file__, 'r', encoding='utf-8') as f:
        content = f.read()

    # Build the needles at runtime (concatenation) so these replace() calls
    # can never match their own source text when the file is rewritten.
    user_placeholder = 'username = ' + '"jouw_gebruikersnaam"'
    pass_placeholder = 'password = ' + '"jouw_wachtwoord"'
    content = content.replace(user_placeholder, f'username = "{username}"')
    content = content.replace(pass_placeholder, f'password = "{password}"')

    # Write back to file
    with open(__file__, 'w', encoding='utf-8') as f:
        f.write(content)

    print("Credentials saved for future use!")

if __name__ == "__main__":
    # Placeholder credentials; update_credentials_in_file() rewrites these
    # assignments on first run so later runs skip the setup prompt.
    username = "jouw_gebruikersnaam"
    password = "jouw_wachtwoord"

    if username == "jouw_gebruikersnaam" or password == "jouw_wachtwoord":
        print("First time setup: Please enter your EBoek.info credentials")
        new_username = input("Username: ")
        new_password = input("Password: ")

        # Update the file with new credentials
        update_credentials_in_file(new_username, new_password)

        # Use the new credentials
        username = new_username
        password = new_password

    scraper = Scraper()
    try:
        # Login first
        scraper.login(username, password)

        # Ask which page(s) to scrape; reprompt on non-numeric input
        # instead of crashing with an unhandled ValueError.
        while True:
            try:
                start = int(input("Enter start page number (1 for first page): "))
                end = int(input("Enter end page number (same as start for single page): "))
                break
            except ValueError:
                print("Please enter whole numbers.")

        # Scrape the specified pages
        scraper.scrape(start_page=start, end_page=end)

        # Keep browser open until the user confirms.
        input("\nDone! Press Enter to close the browser...")
    finally:
        # Ensure the Chrome session is torn down even if login or
        # scraping raises.
        scraper.close()