feat: Add installation scripts for Windows and Unix-based systems
- Created `install_and_run.bat` for Windows installation and setup. - Created `install_and_run.sh` for Unix-based systems installation and setup. - Removed `main.py` as it is no longer needed. - Updated `requirements.txt` to specify package versions and added PyQt5. - Deleted `start.bat` as it is redundant. - Added unit tests for core functionality and scraping modes. - Implemented input validation utilities in `utils/validators.py`. - Added support for dual scraping modes in the scraper.
This commit is contained in:
1
core/__init__.py
Normal file
1
core/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
# Core scraping functionality
|
||||
309
core/credentials.py
Normal file
309
core/credentials.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""
|
||||
Simple JSON-based credential storage system for EBoek.info scraper.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
import stat
|
||||
|
||||
|
||||
class CredentialManager:
    """
    Manages storage and retrieval of user credentials in a JSON config file.

    Credentials are stored in the user's home directory in a hidden folder
    with appropriate file permissions for basic security.

    NOTE(review): the password is stored as plain-text JSON; the 600 file
    permissions are the only protection. An OS keyring would be stronger —
    confirm whether that trade-off is acceptable for this app.
    """

    def __init__(self, app_name="eboek_scraper"):
        """
        Initialize the credential manager.

        Args:
            app_name (str): Application name used to derive the hidden
                config directory name (``~/.<app_name>``).
        """
        self.app_name = app_name
        self.config_dir = Path.home() / f".{app_name}"
        self.config_file = self.config_dir / "config.json"
        self._ensure_config_dir()

    def _ensure_config_dir(self):
        """
        Ensure the configuration directory exists with user-only permissions.

        Falls back to a hidden file in the current working directory if the
        home-directory location cannot be created or secured.
        """
        try:
            if not self.config_dir.exists():
                # Only the owning user may read/write/traverse the directory.
                self.config_dir.mkdir(mode=0o700, exist_ok=True)

            # mkdir's mode is subject to the umask, so re-assert 700 explicitly.
            # Windows has no POSIX permission bits, so skip chmod there.
            if os.name != 'nt':
                os.chmod(self.config_dir, stat.S_IRWXU)  # 700 permissions

        except OSError:
            # Can't create/secure the home-dir location: fall back to CWD.
            self.config_dir = Path(".")
            self.config_file = self.config_dir / f".{self.app_name}_config.json"

    def _load_config(self):
        """
        Load the configuration file.

        Returns:
            dict: Configuration data; an empty dict if the file is missing,
                unreadable, or contains invalid JSON.
        """
        try:
            if self.config_file.exists():
                with open(self.config_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
        except (json.JSONDecodeError, IOError, PermissionError):
            # A corrupt or unreadable config is treated the same as no config.
            pass

        return {}

    def _save_config(self, config_data):
        """
        Save configuration data to file.

        Args:
            config_data (dict): Configuration data to save.

        Returns:
            bool: True if saved successfully, False otherwise.
        """
        try:
            with open(self.config_file, 'w', encoding='utf-8') as f:
                json.dump(config_data, f, indent=2, ensure_ascii=False)

            # Restrict the file itself to user read/write only (POSIX).
            if os.name != 'nt':
                os.chmod(self.config_file, stat.S_IRUSR | stat.S_IWUSR)  # 600

            return True

        except (IOError, PermissionError):
            return False

    def save_credentials(self, username, password, remember=True):
        """
        Save user credentials to the config file.

        Args:
            username (str): EBoek.info username.
            password (str): EBoek.info password.
            remember (bool): Whether to save credentials for future use.
                When False, any previously stored credentials are removed.

        Returns:
            bool: True if saved (or cleared) successfully, False otherwise.
        """
        if not remember:
            # "Don't remember" means: make sure nothing is stored.
            return self.clear_credentials()

        try:
            config = self._load_config()

            config['credentials'] = {
                'username': username,
                'password': password,
                'saved_at': str(Path.home()),  # records which user's home saved it
            }

            return self._save_config(config)

        except Exception:
            return False

    def load_credentials(self):
        """
        Load stored credentials.

        Returns:
            dict or None: Dictionary with 'username' and 'password' keys if
                both are present in the stored config, None otherwise.
        """
        try:
            config = self._load_config()
            credentials = config.get('credentials')

            if credentials and 'username' in credentials and 'password' in credentials:
                return {
                    'username': credentials['username'],
                    'password': credentials['password']
                }

        except Exception:
            pass

        return None

    def has_saved_credentials(self):
        """
        Check if there are saved credentials available.

        Returns:
            bool: True if credentials are available, False otherwise.
        """
        return self.load_credentials() is not None

    def get_saved_username(self):
        """
        Get the saved username without the password.

        Returns:
            str or None: Saved username if available, None otherwise.
        """
        credentials = self.load_credentials()
        return credentials['username'] if credentials else None

    def clear_credentials(self):
        """
        Remove stored credentials from the config file.

        Returns:
            bool: True if cleared successfully (or nothing was stored),
                False otherwise.
        """
        try:
            config = self._load_config()

            if 'credentials' in config:
                del config['credentials']
                return self._save_config(config)

            return True  # Nothing to clear counts as success.

        except Exception:
            return False

    def validate_credentials(self, username, password):
        """
        Basic validation of credential format (no network check).

        Args:
            username (str): Username to validate.
            password (str): Password to validate.

        Returns:
            dict: Validation result with 'valid' bool and 'errors' list.
        """
        errors = []

        if not username or not username.strip():
            errors.append("Username cannot be empty")
        elif len(username.strip()) < 2:
            errors.append("Username must be at least 2 characters")

        if not password or not password.strip():
            errors.append("Password cannot be empty")
        elif len(password) < 3:
            errors.append("Password must be at least 3 characters")

        return {
            'valid': len(errors) == 0,
            'errors': errors
        }

    def get_config_file_path(self):
        """
        Get the path to the configuration file.

        Returns:
            Path: Path to the config file.
        """
        return self.config_file

    def save_app_settings(self, settings):
        """
        Save application settings (non-credential settings).

        Args:
            settings (dict): Application settings to save.

        Returns:
            bool: True if saved successfully, False otherwise.
        """
        try:
            config = self._load_config()
            config['app_settings'] = settings
            return self._save_config(config)
        except Exception:
            return False

    def load_app_settings(self):
        """
        Load application settings (non-credential settings).

        Returns:
            dict: Application settings, empty dict if none saved.
        """
        try:
            config = self._load_config()
            return config.get('app_settings', {})
        except Exception:
            return {}

    def get_default_settings(self):
        """
        Get default application settings.

        Returns:
            dict: Default settings.
        """
        return {
            'headless_mode': True,
            'verbose_logging': False,
            'auto_save_credentials': True,
            'download_path': str(Path.home() / "Downloads"),
            'default_start_page': 1,
            'default_end_page': 1,
            'scraping_mode': 0  # 0=All Comics, 1=Latest Comics
        }

    def export_settings(self, export_path):
        """
        Export settings (excluding credentials) to a file.

        Args:
            export_path (str or Path): Path to export settings to.

        Returns:
            bool: True if exported successfully, False otherwise.
        """
        try:
            config = self._load_config()
            # Never write credentials into an export file.
            export_config = {k: v for k, v in config.items() if k != 'credentials'}

            with open(export_path, 'w', encoding='utf-8') as f:
                json.dump(export_config, f, indent=2, ensure_ascii=False)

            return True
        except Exception:
            return False

    def import_settings(self, import_path):
        """
        Import settings (excluding credentials) from a file.

        Args:
            import_path (str or Path): Path to import settings from.

        Returns:
            bool: True if imported successfully, False otherwise.
        """
        try:
            with open(import_path, 'r', encoding='utf-8') as f:
                imported_config = json.load(f)

            # Never accept credentials from an imported file.
            if 'credentials' in imported_config:
                del imported_config['credentials']

            # Imported keys override existing ones; untouched keys survive.
            config = self._load_config()
            config.update(imported_config)

            return self._save_config(config)
        except Exception:
            return False
||||
513
core/scraper.py
Normal file
513
core/scraper.py
Normal file
@@ -0,0 +1,513 @@
|
||||
"""
|
||||
Core scraper functionality extracted from main.py with callback support for GUI integration.
|
||||
"""
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.common.keys import Keys
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
import time
|
||||
import random
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Disable SSL verification warnings and errors
|
||||
import urllib3
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
|
||||
class Scraper:
    """
    EBoek.info web scraper with GUI callback support.

    This class handles the core scraping functionality while providing
    callback mechanisms for progress updates to a GUI application.
    """

    def __init__(self, headless=False, progress_callback=None, scraping_mode=0):
        """
        Initialize the scraper with optional GUI callback support.

        Args:
            headless (bool): Whether to run Chrome in headless mode.
            progress_callback (callable): Optional callback function for
                progress updates. Signature: callback(event_type: str, data: dict).
            scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics).
        """
        self.progress_callback = progress_callback
        self._stop_requested = False
        self.scraping_mode = scraping_mode

        # Set up Chrome options with anti-detection measures
        chrome_options = Options()
        if headless:
            chrome_options.add_argument('--headless')

        # Fix SSL and certificate issues
        chrome_options.add_argument('--ignore-ssl-errors')
        chrome_options.add_argument('--ignore-certificate-errors')
        chrome_options.add_argument('--disable-web-security')
        chrome_options.add_argument('--allow-running-insecure-content')
        chrome_options.add_argument('--disable-extensions')

        # Fix DevTools connection issues
        chrome_options.add_argument('--remote-debugging-port=0')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--no-sandbox')

        # Make it look more human.
        # BUG FIX: excludeSwitches was previously set twice (once with
        # 'enable-automation', later with 'enable-logging'); the second call
        # overwrote the first, silently dropping the anti-automation switch.
        # Both switches are now passed in a single call.
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option(
            "excludeSwitches", ["enable-automation", "enable-logging"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

        # Suppress Chrome's own logging output
        chrome_options.add_argument('--disable-logging')
        chrome_options.add_argument('--log-level=3')

        # Set cross-platform download directory
        downloads_path = str(Path.home() / "Downloads")
        prefs = {
            "download.default_directory": downloads_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        }
        chrome_options.add_experimental_option("prefs", prefs)

        self.driver = webdriver.Chrome(options=chrome_options)
        # Hide navigator.webdriver so simple bot checks don't flag the session.
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

        self._emit_progress("scraper_initialized", {"headless": headless, "downloads_path": downloads_path})

    def _emit_progress(self, event_type, data):
        """
        Internal method to emit progress updates via callback.

        Args:
            event_type (str): Type of event (e.g., 'page_started', 'comic_completed').
            data (dict): Event data.
        """
        if self.progress_callback:
            try:
                self.progress_callback(event_type, data)
            except Exception:
                # Don't let callback errors crash the scraper
                pass

    def request_stop(self):
        """Request the scraper to stop gracefully at the next opportunity."""
        self._stop_requested = True
        self._emit_progress("stop_requested", {})

    def human_delay(self, min_sec=0.5, max_sec=2):
        """
        Simulate human-like delay with cancellation support.

        Args:
            min_sec (float): Minimum delay time in seconds.
            max_sec (float): Maximum delay time in seconds.
        """
        if self._stop_requested:
            return
        delay_time = random.uniform(min_sec, max_sec)
        self._emit_progress("delay_started", {"duration": delay_time})
        time.sleep(delay_time)

    def human_type(self, element, text):
        """
        Type text character by character with human-like delays.

        Args:
            element: Selenium web element to type into.
            text (str): Text to type.
        """
        for char in text:
            if self._stop_requested:
                return
            element.send_keys(char)
            time.sleep(random.uniform(0.05, 0.15))

    def navigate(self, url):
        """
        Navigate to a URL with human-like delay.

        Args:
            url (str): URL to navigate to.

        Returns:
            bool: False if a stop was requested before navigating, True otherwise.
        """
        if self._stop_requested:
            return False

        self._emit_progress("navigation_started", {"url": url})
        self.driver.get(url)
        self.human_delay(1, 3)
        self._emit_progress("navigation_completed", {"url": url})
        return True

    def login(self, username, password):
        """
        Login to EBoek.info with provided credentials.

        Args:
            username (str): Username for login.
            password (str): Password for login.

        Returns:
            bool: True if login successful, False otherwise.
        """
        if self._stop_requested:
            return False

        self._emit_progress("login_started", {"username": username})

        try:
            self.driver.get("https://eboek.info/komerin")
            self.human_delay(2, 4)

            if self._stop_requested:
                return False

            # Find and fill username field
            username_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='text']")
            self.human_type(username_field, username)

            self.human_delay(0.5, 1)

            if self._stop_requested:
                return False

            # Find and fill password field
            password_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='password']")
            self.human_type(password_field, password)

            self.human_delay(0.5, 1.5)

            if self._stop_requested:
                return False

            # Submit the form
            submit_button = self.driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
            submit_button.click()

            self.human_delay(2, 4)

            # Heuristic success check: a successful login navigates away from
            # the /komerin login page. TODO(review): confirm by checking for a
            # post-login element instead of the URL.
            current_url = self.driver.current_url
            login_successful = "komerin" not in current_url

            if login_successful:
                self._emit_progress("login_success", {"username": username})
            else:
                self._emit_progress("login_failed", {"username": username, "error": "Login appears to have failed"})

            return login_successful

        except Exception as e:
            self._emit_progress("login_failed", {"username": username, "error": str(e)})
            return False

    def trigger_download(self, url):
        """
        Open URL in new tab to trigger browser download.

        Args:
            url (str): URL of file to download.

        Returns:
            bool: True if download triggered successfully.
        """
        if self._stop_requested:
            return False

        try:
            # Store current window handle
            current_window = self.driver.current_window_handle

            # Use JavaScript to open URL in new tab with same session
            self.driver.execute_script(f"window.open('{url}', '_blank');")

            # Wait for download to complete and tab to auto-close
            self.human_delay(3, 5)

            # Switch back to original window
            self.driver.switch_to.window(current_window)

            self._emit_progress("download_triggered", {"url": url})
            return True

        except Exception as e:
            self._emit_progress("download_failed", {"url": url, "error": str(e)})
            return False

    def scrape(self, start_page=1, end_page=1):
        """
        Scrape comics from specified page range.

        Args:
            start_page (int): Starting page number.
            end_page (int): Ending page number.

        Returns:
            dict: Summary of scraping results (success flag, counts, errors,
                cancelled flag).
        """
        if self._stop_requested:
            return {"success": False, "reason": "Cancelled before starting"}

        # Determine base URL and URL pattern based on scraping mode
        if self.scraping_mode == 1:  # Latest Comics
            base_url = "https://eboek.info/laatste"
            mode_name = "Latest Comics"
        else:  # All Comics (default)
            base_url = "https://eboek.info/stripverhalen-alle"
            mode_name = "All Comics"

        total_pages = end_page - start_page + 1
        total_comics_processed = 0
        total_downloads_triggered = 0
        # Explicit counter replaces the previous fragile "'page_num' in locals()"
        # check when building the summary.
        pages_attempted = 0
        errors = []

        self._emit_progress("scraping_started", {
            "start_page": start_page,
            "end_page": end_page,
            "total_pages": total_pages,
            "mode": mode_name
        })

        for page_num in range(start_page, end_page + 1):
            pages_attempted = page_num - start_page + 1
            if self._stop_requested:
                break

            # Construct page URL based on scraping mode
            if self.scraping_mode == 1:  # Latest Comics
                page_url = f"{base_url}?_page={page_num}&ref=dw"
            else:  # All Comics: page 1 has no /page/ suffix
                if page_num == 1:
                    page_url = base_url
                else:
                    page_url = f"{base_url}/page/{page_num}/"

            current_page_index = page_num - start_page + 1
            self._emit_progress("page_started", {
                "page_number": page_num,
                "page_index": current_page_index,
                "total_pages": total_pages,
                "url": page_url
            })

            # Navigate to the page
            if not self.navigate(page_url):
                continue

            # Scroll down a bit like a human would to see content
            self.driver.execute_script("window.scrollTo(0, 300)")
            self.human_delay(1, 2)

            if self._stop_requested:
                break

            try:
                # Find all comic strip links using mode-specific CSS selectors
                if self.scraping_mode == 1:
                    # "laatste" page: target only title links to avoid duplicates
                    comic_links = self.driver.find_elements(By.CSS_SELECTOR, '.pt-cv-wrapper .pt-cv-ifield h5.pt-cv-title a')
                else:
                    # "stripverhalen-alle" page: original selector
                    comic_links = self.driver.find_elements(By.CSS_SELECTOR, 'h2.post-title a')

                comic_count = len(comic_links)

                self._emit_progress("page_comics_found", {
                    "page_number": page_num,
                    "comic_count": comic_count
                })

                # Store URLs first to avoid stale element issues
                comic_urls = [link.get_attribute('href') for link in comic_links]

                # Take a break between pages (more likely and longer)
                if page_num > start_page:
                    if random.random() < 0.7:  # 70% chance of a long break
                        break_time = random.uniform(15, 45)  # 15-45 seconds
                        self._emit_progress("page_break_started", {
                            "duration": break_time,
                            "page_number": page_num
                        })
                        time.sleep(break_time)
                    else:
                        # Even without a long break, always pause a bit
                        short_break = random.uniform(5, 10)
                        self._emit_progress("short_break", {
                            "duration": short_break,
                            "page_number": page_num
                        })
                        time.sleep(short_break)

                # Process all comics on this page
                for i, url in enumerate(comic_urls, 1):
                    if self._stop_requested:
                        break

                    self._emit_progress("comic_started", {
                        "page_number": page_num,
                        "comic_index": i,
                        "total_comics": comic_count,
                        "url": url
                    })

                    # Random chance to scroll on main page before clicking
                    if random.random() < 0.4:
                        scroll_amount = random.randint(100, 500)
                        self.driver.execute_script(f"window.scrollBy(0, {scroll_amount})")
                        self.human_delay(0.5, 1.5)

                    # Open in new tab to keep main page
                    self.driver.execute_script("window.open('');")
                    self.driver.switch_to.window(self.driver.window_handles[-1])

                    try:
                        self.driver.get(url)
                        self.human_delay(2, 4)

                        if self._stop_requested:
                            break

                        # Sometimes scroll down to see the content
                        if random.random() < 0.6:
                            self.driver.execute_script("window.scrollTo(0, 400)")
                            self.human_delay(0.5, 1.5)

                        # Extract title; fall back to a synthetic one if missing
                        try:
                            title = self.driver.find_element(By.CSS_SELECTOR, 'h1.entry-title').text
                        except Exception:
                            title = f"Comic {i} on page {page_num}"

                        self._emit_progress("comic_title_extracted", {
                            "title": title,
                            "url": url
                        })

                        # Small delay before clicking download
                        self.human_delay(0.8, 2)

                        if self._stop_requested:
                            break

                        # Execute the site's downloadLinks() JavaScript function
                        self.driver.execute_script("downloadLinks()")
                        self.human_delay(1.5, 3)

                        # Find all download links in the table
                        download_links = self.driver.find_elements(By.CSS_SELECTOR, 'table a')
                        download_count = len(download_links)

                        self._emit_progress("download_links_found", {
                            "title": title,
                            "download_count": download_count
                        })

                        # Trigger download for each file
                        for j, link in enumerate(download_links):
                            if self._stop_requested:
                                break

                            file_url = link.get_attribute('href')
                            file_name = link.text.strip()

                            self._emit_progress("download_started", {
                                "file_name": file_name,
                                "url": file_url,
                                "index": j + 1,
                                "total": download_count
                            })

                            if self.trigger_download(file_url):
                                total_downloads_triggered += 1

                            # Human-like delay between downloads (skip after last)
                            if j < len(download_links) - 1:
                                delay_time = random.uniform(2, 5)
                                self._emit_progress("download_delay", {
                                    "duration": delay_time,
                                    "remaining": len(download_links) - j - 1
                                })
                                time.sleep(delay_time)

                        total_comics_processed += 1

                        self._emit_progress("comic_completed", {
                            "title": title,
                            "downloads_triggered": download_count,
                            "page_number": page_num,
                            "comic_index": i
                        })

                        # Take a longer break every 5 comics
                        if i % 5 == 0 and i < len(comic_urls):
                            break_time = random.uniform(3, 7)
                            self._emit_progress("comic_batch_break", {
                                "duration": break_time,
                                "comics_processed": i
                            })
                            time.sleep(break_time)

                    except Exception as e:
                        error_msg = f"Error processing {url}: {e}"
                        errors.append(error_msg)
                        self._emit_progress("comic_error", {
                            "url": url,
                            "error": str(e)
                        })
                        # Human would pause after an error
                        self.human_delay(2, 4)

                    finally:
                        # BUG FIX: tab cleanup now runs in `finally` so the
                        # comic tab is closed even when a stop request breaks
                        # out of the loop mid-comic (previously the tab was
                        # leaked and the driver stayed focused on it).
                        try:
                            self.driver.close()
                            self.driver.switch_to.window(self.driver.window_handles[0])
                        except Exception:
                            # Handle case where tab might have closed itself
                            if len(self.driver.window_handles) > 0:
                                self.driver.switch_to.window(self.driver.window_handles[0])

                    # Vary the delay between comics
                    self.human_delay(1, 3)

                self._emit_progress("page_completed", {
                    "page_number": page_num,
                    "comics_processed": len(comic_urls)
                })

            except Exception as e:
                error_msg = f"Error processing page {page_num}: {e}"
                errors.append(error_msg)
                self._emit_progress("page_error", {
                    "page_number": page_num,
                    "error": str(e)
                })

        # Generate summary
        summary = {
            "success": not self._stop_requested,
            "total_pages_processed": min(pages_attempted, total_pages),
            "total_comics_processed": total_comics_processed,
            "total_downloads_triggered": total_downloads_triggered,
            "errors": errors,
            "cancelled": self._stop_requested
        }

        self._emit_progress("scraping_completed", summary)

        return summary

    def close(self):
        """Close the browser and clean up resources."""
        try:
            self.driver.quit()
            self._emit_progress("scraper_closed", {})
        except Exception as e:
            self._emit_progress("scraper_close_error", {"error": str(e)})
301
core/scraper_thread.py
Normal file
301
core/scraper_thread.py
Normal file
@@ -0,0 +1,301 @@
|
||||
"""
|
||||
QThread wrapper for the Scraper class with PyQt signals for GUI communication.
|
||||
"""
|
||||
|
||||
from PyQt5.QtCore import QThread, pyqtSignal
|
||||
from .scraper import Scraper
|
||||
import time
|
||||
|
||||
|
||||
class ScraperThread(QThread):
|
||||
"""
|
||||
Thread wrapper for the Scraper class that converts callback events to PyQt signals.
|
||||
|
||||
This class runs the scraper in a separate thread and emits signals that can be
|
||||
connected to GUI components for real-time updates.
|
||||
"""
|
||||
|
||||
# Login-related signals
|
||||
login_started = pyqtSignal(str) # username
|
||||
login_success = pyqtSignal(str) # username
|
||||
login_failed = pyqtSignal(str, str) # username, error_message
|
||||
|
||||
# Scraping progress signals
|
||||
scraping_started = pyqtSignal(int, int, int) # start_page, end_page, total_pages
|
||||
scraping_completed = pyqtSignal(dict) # summary dictionary
|
||||
|
||||
# Page-level progress signals
|
||||
page_started = pyqtSignal(int, int, int, str) # page_number, page_index, total_pages, url
|
||||
page_completed = pyqtSignal(int, int) # page_number, comics_processed
|
||||
page_comics_found = pyqtSignal(int, int) # page_number, comic_count
|
||||
page_error = pyqtSignal(int, str) # page_number, error_message
|
||||
|
||||
# Comic-level progress signals
|
||||
comic_started = pyqtSignal(int, int, int, str) # page_number, comic_index, total_comics, url
|
||||
comic_completed = pyqtSignal(str, int, int, int) # title, downloads_triggered, page_number, comic_index
|
||||
comic_title_extracted = pyqtSignal(str, str) # title, url
|
||||
comic_error = pyqtSignal(str, str) # url, error_message
|
||||
|
||||
# Download-related signals
|
||||
download_links_found = pyqtSignal(str, int) # title, download_count
|
||||
download_started = pyqtSignal(str, str, int, int) # file_name, url, index, total
|
||||
download_triggered = pyqtSignal(str) # url
|
||||
download_failed = pyqtSignal(str, str) # url, error_message
|
||||
|
||||
# General status and control signals
|
||||
status_update = pyqtSignal(str) # general status message
|
||||
error_occurred = pyqtSignal(str) # error message
|
||||
delay_started = pyqtSignal(float) # duration
|
||||
stop_requested = pyqtSignal()
|
||||
|
||||
# Navigation signals
|
||||
navigation_started = pyqtSignal(str) # url
|
||||
navigation_completed = pyqtSignal(str) # url
|
||||
|
||||
# Break and timing signals
|
||||
page_break_started = pyqtSignal(float, int) # duration, page_number
|
||||
short_break = pyqtSignal(float, int) # duration, page_number
|
||||
comic_batch_break = pyqtSignal(float, int) # duration, comics_processed
|
||||
download_delay = pyqtSignal(float, int) # duration, remaining_downloads
|
||||
|
||||
def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True):
|
||||
"""
|
||||
Initialize the scraper thread.
|
||||
|
||||
Args:
|
||||
username (str): EBoek.info username
|
||||
password (str): EBoek.info password
|
||||
start_page (int): Starting page number
|
||||
end_page (int): Ending page number
|
||||
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
|
||||
headless (bool): Whether to run Chrome in headless mode
|
||||
"""
|
||||
super().__init__()
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.start_page = start_page
|
||||
self.end_page = end_page
|
||||
self.scraping_mode = scraping_mode
|
||||
self.headless = headless
|
||||
self.scraper = None
|
||||
self._is_running = False
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Main thread execution method.
|
||||
This runs in the separate thread and should not be called directly.
|
||||
"""
|
||||
try:
|
||||
self._is_running = True
|
||||
|
||||
# Initialize scraper with progress callback
|
||||
self.scraper = Scraper(
|
||||
headless=self.headless,
|
||||
progress_callback=self._handle_scraper_progress,
|
||||
scraping_mode=self.scraping_mode
|
||||
)
|
||||
|
||||
# Perform login
|
||||
self.login_started.emit(self.username)
|
||||
login_success = self.scraper.login(self.username, self.password)
|
||||
|
||||
if not login_success:
|
||||
self.login_failed.emit(self.username, "Login failed. Please check your credentials.")
|
||||
return
|
||||
|
||||
# Check if stop was requested during login
|
||||
if self.scraper._stop_requested:
|
||||
return
|
||||
|
||||
# Start scraping
|
||||
summary = self.scraper.scrape(self.start_page, self.end_page)
|
||||
|
||||
# Emit completion signal
|
||||
self.scraping_completed.emit(summary)
|
||||
|
||||
except Exception as e:
|
||||
self.error_occurred.emit(f"Unexpected error: {str(e)}")
|
||||
finally:
|
||||
# Clean up
|
||||
if self.scraper:
|
||||
self.scraper.close()
|
||||
self._is_running = False
|
||||
|
||||
def _handle_scraper_progress(self, event_type, data):
|
||||
"""
|
||||
Handle progress callbacks from the Scraper and convert them to PyQt signals.
|
||||
|
||||
Args:
|
||||
event_type (str): Type of event from the scraper
|
||||
data (dict): Event data
|
||||
"""
|
||||
try:
|
||||
# Login events
|
||||
if event_type == "login_started":
|
||||
# Already handled in run() method
|
||||
pass
|
||||
elif event_type == "login_success":
|
||||
self.login_success.emit(data.get("username", ""))
|
||||
elif event_type == "login_failed":
|
||||
self.login_failed.emit(data.get("username", ""), data.get("error", "Unknown error"))
|
||||
|
||||
# Scraping events
|
||||
elif event_type == "scraping_started":
|
||||
self.scraping_started.emit(
|
||||
data.get("start_page", 1),
|
||||
data.get("end_page", 1),
|
||||
data.get("total_pages", 1)
|
||||
)
|
||||
elif event_type == "scraping_completed":
|
||||
self.scraping_completed.emit(data)
|
||||
|
||||
# Page events
|
||||
elif event_type == "page_started":
|
||||
self.page_started.emit(
|
||||
data.get("page_number", 1),
|
||||
data.get("page_index", 1),
|
||||
data.get("total_pages", 1),
|
||||
data.get("url", "")
|
||||
)
|
||||
elif event_type == "page_completed":
|
||||
self.page_completed.emit(
|
||||
data.get("page_number", 1),
|
||||
data.get("comics_processed", 0)
|
||||
)
|
||||
elif event_type == "page_comics_found":
|
||||
self.page_comics_found.emit(
|
||||
data.get("page_number", 1),
|
||||
data.get("comic_count", 0)
|
||||
)
|
||||
elif event_type == "page_error":
|
||||
self.page_error.emit(
|
||||
data.get("page_number", 1),
|
||||
data.get("error", "Unknown error")
|
||||
)
|
||||
|
||||
# Comic events
|
||||
elif event_type == "comic_started":
|
||||
self.comic_started.emit(
|
||||
data.get("page_number", 1),
|
||||
data.get("comic_index", 1),
|
||||
data.get("total_comics", 1),
|
||||
data.get("url", "")
|
||||
)
|
||||
elif event_type == "comic_completed":
|
||||
self.comic_completed.emit(
|
||||
data.get("title", "Unknown"),
|
||||
data.get("downloads_triggered", 0),
|
||||
data.get("page_number", 1),
|
||||
data.get("comic_index", 1)
|
||||
)
|
||||
elif event_type == "comic_title_extracted":
|
||||
self.comic_title_extracted.emit(
|
||||
data.get("title", "Unknown"),
|
||||
data.get("url", "")
|
||||
)
|
||||
elif event_type == "comic_error":
|
||||
self.comic_error.emit(
|
||||
data.get("url", ""),
|
||||
data.get("error", "Unknown error")
|
||||
)
|
||||
|
||||
# Download events
|
||||
elif event_type == "download_links_found":
|
||||
self.download_links_found.emit(
|
||||
data.get("title", "Unknown"),
|
||||
data.get("download_count", 0)
|
||||
)
|
||||
elif event_type == "download_started":
|
||||
self.download_started.emit(
|
||||
data.get("file_name", ""),
|
||||
data.get("url", ""),
|
||||
data.get("index", 1),
|
||||
data.get("total", 1)
|
||||
)
|
||||
elif event_type == "download_triggered":
|
||||
self.download_triggered.emit(data.get("url", ""))
|
||||
elif event_type == "download_failed":
|
||||
self.download_failed.emit(
|
||||
data.get("url", ""),
|
||||
data.get("error", "Unknown error")
|
||||
)
|
||||
|
||||
# Navigation events
|
||||
elif event_type == "navigation_started":
|
||||
self.navigation_started.emit(data.get("url", ""))
|
||||
elif event_type == "navigation_completed":
|
||||
self.navigation_completed.emit(data.get("url", ""))
|
||||
|
||||
# Timing and break events
|
||||
elif event_type == "delay_started":
|
||||
self.delay_started.emit(data.get("duration", 0.0))
|
||||
elif event_type == "page_break_started":
|
||||
self.page_break_started.emit(
|
||||
data.get("duration", 0.0),
|
||||
data.get("page_number", 1)
|
||||
)
|
||||
elif event_type == "short_break":
|
||||
self.short_break.emit(
|
||||
data.get("duration", 0.0),
|
||||
data.get("page_number", 1)
|
||||
)
|
||||
elif event_type == "comic_batch_break":
|
||||
self.comic_batch_break.emit(
|
||||
data.get("duration", 0.0),
|
||||
data.get("comics_processed", 0)
|
||||
)
|
||||
elif event_type == "download_delay":
|
||||
self.download_delay.emit(
|
||||
data.get("duration", 0.0),
|
||||
data.get("remaining", 0)
|
||||
)
|
||||
|
||||
# Control events
|
||||
elif event_type == "stop_requested":
|
||||
self.stop_requested.emit()
|
||||
|
||||
# General status updates
|
||||
elif event_type in ["scraper_initialized", "scraper_closed", "scraper_close_error"]:
|
||||
self.status_update.emit(f"{event_type}: {data}")
|
||||
|
||||
# Emit a general status update for events we didn't specifically handle
|
||||
else:
|
||||
self.status_update.emit(f"{event_type}: {data}")
|
||||
|
||||
except Exception as e:
|
||||
# Don't let signal emission errors crash the scraper
|
||||
self.error_occurred.emit(f"Signal emission error: {str(e)}")
|
||||
|
||||
def request_stop(self):
    """
    Ask the underlying Scraper to stop gracefully.

    Safe to call from the main (GUI) thread; a no-op when the scraper
    has not been created yet.
    """
    scraper = self.scraper
    if scraper:
        scraper.request_stop()
def is_running(self):
    """
    Check whether the scraper thread is currently active.

    Returns:
        bool: True only when both the internal flag is set and the
        QThread itself reports it is running.
    """
    # Preserve short-circuit semantics: skip the QThread query entirely
    # when the internal flag is already cleared.
    return self.isRunning() if self._is_running else self._is_running
def get_progress_summary(self):
    """
    Build a snapshot of the current progress state.

    Thread-safe enough to be called from the main (GUI) thread.

    Returns:
        dict: Current progress information; {"status": "not_started"}
        before the scraper has been created.
    """
    scraper = self.scraper
    if not scraper:
        return {"status": "not_started"}

    # NOTE(review): reads Scraper's private _stop_requested flag directly.
    return {
        "status": "running" if self._is_running else "stopped",
        "stop_requested": scraper._stop_requested,
        "thread_running": self.isRunning(),
    }
Reference in New Issue
Block a user