feat: Add installation scripts for Windows and Unix-based systems

- Created `install_and_run.bat` for Windows installation and setup.
- Created `install_and_run.sh` for installation and setup on Unix-based systems.
- Removed `main.py` as it is no longer needed.
- Updated `requirements.txt` to specify package versions and added PyQt5.
- Deleted `start.bat` as it is redundant.
- Added unit tests for core functionality and scraping modes.
- Implemented input validation utilities in `utils/validators.py`.
- Added support for dual scraping modes in the scraper.
Author: Louis Mylle
Date: 2026-01-10 14:45:00 +01:00
parent 5f2fca226b
commit ea4cab15c3

19 changed files with 3731 additions and 335 deletions

core/__init__.py (new file, 1 line)

@@ -0,0 +1 @@
# Core scraping functionality

core/credentials.py (new file, 309 lines)

@@ -0,0 +1,309 @@
"""
Simple JSON-based credential storage system for EBoek.info scraper.
"""
import json
import os
from pathlib import Path
import stat
class CredentialManager:
"""
Manages storage and retrieval of user credentials in a JSON config file.
Credentials are stored in the user's home directory in a hidden folder
with appropriate file permissions for basic security.
"""
def __init__(self, app_name="eboek_scraper"):
"""
Initialize the credential manager.
Args:
app_name (str): Application name for config directory
"""
self.app_name = app_name
self.config_dir = Path.home() / f".{app_name}"
self.config_file = self.config_dir / "config.json"
self._ensure_config_dir()
def _ensure_config_dir(self):
"""
Ensure the configuration directory exists with appropriate permissions.
"""
try:
if not self.config_dir.exists():
self.config_dir.mkdir(mode=0o700, exist_ok=True) # Only user can read/write/execute
# Ensure directory has correct permissions (user only)
if os.name != 'nt': # Unix-like systems (macOS, Linux)
os.chmod(self.config_dir, stat.S_IRWXU) # 700 permissions
except Exception as e:
# If we can't create the config directory, fall back to current directory
self.config_dir = Path(".")
self.config_file = self.config_dir / f".{self.app_name}_config.json"
def _load_config(self):
"""
Load the configuration file.
Returns:
dict: Configuration data, empty dict if file doesn't exist
"""
try:
if self.config_file.exists():
with open(self.config_file, 'r', encoding='utf-8') as f:
return json.load(f)
except (json.JSONDecodeError, IOError, PermissionError) as e:
# If there's any error reading the config, return empty dict
pass
return {}
def _save_config(self, config_data):
"""
Save configuration data to file.
Args:
config_data (dict): Configuration data to save
Returns:
bool: True if saved successfully, False otherwise
"""
try:
with open(self.config_file, 'w', encoding='utf-8') as f:
json.dump(config_data, f, indent=2, ensure_ascii=False)
# Set file permissions to be readable/writable by user only
if os.name != 'nt': # Unix-like systems
os.chmod(self.config_file, stat.S_IRUSR | stat.S_IWUSR) # 600 permissions
return True
except (IOError, PermissionError) as e:
return False
def save_credentials(self, username, password, remember=True):
"""
Save user credentials to the config file.
Args:
username (str): EBoek.info username
password (str): EBoek.info password
remember (bool): Whether to save credentials for future use
Returns:
bool: True if saved successfully, False otherwise
"""
if not remember:
# If remember is False, just clear any existing credentials
return self.clear_credentials()
try:
config = self._load_config()
config['credentials'] = {
'username': username,
'password': password,
'saved_at': str(Path.home()), # Just to know which user saved it
}
return self._save_config(config)
except Exception as e:
return False
def load_credentials(self):
"""
Load stored credentials.
Returns:
dict or None: Dictionary with 'username' and 'password' keys if found,
None if no credentials are stored
"""
try:
config = self._load_config()
credentials = config.get('credentials')
if credentials and 'username' in credentials and 'password' in credentials:
return {
'username': credentials['username'],
'password': credentials['password']
}
except Exception as e:
pass
return None
def has_saved_credentials(self):
"""
Check if there are saved credentials available.
Returns:
bool: True if credentials are available, False otherwise
"""
return self.load_credentials() is not None
def get_saved_username(self):
"""
Get the saved username without the password.
Returns:
str or None: Saved username if available, None otherwise
"""
credentials = self.load_credentials()
return credentials['username'] if credentials else None
def clear_credentials(self):
"""
Remove stored credentials from the config file.
Returns:
bool: True if cleared successfully, False otherwise
"""
try:
config = self._load_config()
if 'credentials' in config:
del config['credentials']
return self._save_config(config)
return True # No credentials to clear is success
except Exception as e:
return False
def validate_credentials(self, username, password):
"""
Basic validation of credential format.
Args:
username (str): Username to validate
password (str): Password to validate
Returns:
dict: Validation result with 'valid' bool and 'errors' list
"""
errors = []
if not username or not username.strip():
errors.append("Username cannot be empty")
elif len(username.strip()) < 2:
errors.append("Username must be at least 2 characters")
if not password or not password.strip():
errors.append("Password cannot be empty")
elif len(password) < 3:
errors.append("Password must be at least 3 characters")
return {
'valid': len(errors) == 0,
'errors': errors
}
def get_config_file_path(self):
"""
Get the path to the configuration file.
Returns:
Path: Path to the config file
"""
return self.config_file
def save_app_settings(self, settings):
"""
Save application settings (non-credential settings).
Args:
settings (dict): Application settings to save
Returns:
bool: True if saved successfully, False otherwise
"""
try:
config = self._load_config()
config['app_settings'] = settings
return self._save_config(config)
except Exception as e:
return False
def load_app_settings(self):
"""
Load application settings (non-credential settings).
Returns:
dict: Application settings, empty dict if none saved
"""
try:
config = self._load_config()
return config.get('app_settings', {})
except Exception as e:
return {}
def get_default_settings(self):
"""
Get default application settings.
Returns:
dict: Default settings
"""
return {
'headless_mode': True,
'verbose_logging': False,
'auto_save_credentials': True,
'download_path': str(Path.home() / "Downloads"),
'default_start_page': 1,
'default_end_page': 1,
'scraping_mode': 0 # 0=All Comics, 1=Latest Comics
}
def export_settings(self, export_path):
"""
Export settings (excluding credentials) to a file.
Args:
export_path (str or Path): Path to export settings to
Returns:
bool: True if exported successfully, False otherwise
"""
try:
config = self._load_config()
# Remove credentials from export
export_config = {k: v for k, v in config.items() if k != 'credentials'}
with open(export_path, 'w', encoding='utf-8') as f:
json.dump(export_config, f, indent=2, ensure_ascii=False)
return True
except Exception as e:
return False
def import_settings(self, import_path):
"""
Import settings (excluding credentials) from a file.
Args:
import_path (str or Path): Path to import settings from
Returns:
bool: True if imported successfully, False otherwise
"""
try:
with open(import_path, 'r', encoding='utf-8') as f:
imported_config = json.load(f)
# Don't import credentials for security
if 'credentials' in imported_config:
del imported_config['credentials']
# Merge with existing config
config = self._load_config()
config.update(imported_config)
return self._save_config(config)
except Exception as e:
return False
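For reference, a minimal usage sketch of the `CredentialManager` API above. This harness is not part of the commit, and the demo_user/demo_pass values are placeholders:

from core.credentials import CredentialManager

manager = CredentialManager()

# Validate the format first, then persist; both values are placeholders.
check = manager.validate_credentials("demo_user", "demo_pass")
if check['valid']:
    manager.save_credentials("demo_user", "demo_pass", remember=True)

# A later run can restore the saved login without re-prompting.
if manager.has_saved_credentials():
    creds = manager.load_credentials()
    print(f"Restored login for {creds['username']}")

# Non-credential settings live in the same config file; fall back to defaults.
settings = manager.load_app_settings() or manager.get_default_settings()
manager.save_app_settings(settings)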

core/scraper.py (new file, 513 lines)

@@ -0,0 +1,513 @@
"""
Core scraper functionality extracted from main.py with callback support for GUI integration.
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
import random
import os
import sys
from pathlib import Path
# Disable SSL verification warnings and errors
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class Scraper:
"""
EBoek.info web scraper with GUI callback support.
This class handles the core scraping functionality while providing
callback mechanisms for progress updates to a GUI application.
"""
def __init__(self, headless=False, progress_callback=None, scraping_mode=0):
"""
Initialize the scraper with optional GUI callback support.
Args:
headless (bool): Whether to run Chrome in headless mode
progress_callback (callable): Optional callback function for progress updates
Callback signature: callback(event_type: str, data: dict)
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
"""
self.progress_callback = progress_callback
self._stop_requested = False
self.scraping_mode = scraping_mode
# Set up Chrome options with anti-detection measures
chrome_options = Options()
if headless:
chrome_options.add_argument('--headless')
# Fix SSL and certificate issues
chrome_options.add_argument('--ignore-ssl-errors')
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--disable-web-security')
chrome_options.add_argument('--allow-running-insecure-content')
chrome_options.add_argument('--disable-extensions')
# Fix DevTools connection issues
chrome_options.add_argument('--remote-debugging-port=0')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--no-sandbox')
# Make it look more human
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
# Suppress logging
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('--disable-logging')
chrome_options.add_argument('--log-level=3')
# Set cross-platform download directory
downloads_path = str(Path.home() / "Downloads")
prefs = {
"download.default_directory": downloads_path,
"download.prompt_for_download": False,
"download.directory_upgrade": True,
"safebrowsing.enabled": True
}
chrome_options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(options=chrome_options)
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
self._emit_progress("scraper_initialized", {"headless": headless, "downloads_path": downloads_path})
def _emit_progress(self, event_type, data):
"""
Internal method to emit progress updates via callback.
Args:
event_type (str): Type of event (e.g., 'page_started', 'comic_completed')
data (dict): Event data
"""
if self.progress_callback:
try:
self.progress_callback(event_type, data)
except Exception as e:
# Don't let callback errors crash the scraper
pass
def request_stop(self):
"""Request the scraper to stop gracefully at the next opportunity."""
self._stop_requested = True
self._emit_progress("stop_requested", {})
def human_delay(self, min_sec=0.5, max_sec=2):
"""
Simulate human-like delay with cancellation support.
Args:
min_sec (float): Minimum delay time
max_sec (float): Maximum delay time
"""
if self._stop_requested:
return
delay_time = random.uniform(min_sec, max_sec)
self._emit_progress("delay_started", {"duration": delay_time})
time.sleep(delay_time)
def human_type(self, element, text):
"""
Type text character by character with human-like delays.
Args:
element: Selenium web element to type into
text (str): Text to type
"""
for char in text:
if self._stop_requested:
return
element.send_keys(char)
time.sleep(random.uniform(0.05, 0.15))
def navigate(self, url):
"""
Navigate to a URL with human-like delay.
Args:
url (str): URL to navigate to
"""
if self._stop_requested:
return False
self._emit_progress("navigation_started", {"url": url})
self.driver.get(url)
self.human_delay(1, 3)
self._emit_progress("navigation_completed", {"url": url})
return True
def login(self, username, password):
"""
Login to EBoek.info with provided credentials.
Args:
username (str): Username for login
password (str): Password for login
Returns:
bool: True if login successful, False otherwise
"""
if self._stop_requested:
return False
self._emit_progress("login_started", {"username": username})
try:
self.driver.get("https://eboek.info/komerin")
self.human_delay(2, 4)
if self._stop_requested:
return False
# Find and fill username field
username_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='text']")
self.human_type(username_field, username)
self.human_delay(0.5, 1)
if self._stop_requested:
return False
# Find and fill password field
password_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='password']")
self.human_type(password_field, password)
self.human_delay(0.5, 1.5)
if self._stop_requested:
return False
# Submit the form
submit_button = self.driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
submit_button.click()
self.human_delay(2, 4)
# Check if login was successful (basic check)
# You could enhance this by checking for specific elements that appear after login
current_url = self.driver.current_url
login_successful = "komerin" not in current_url
if login_successful:
self._emit_progress("login_success", {"username": username})
else:
self._emit_progress("login_failed", {"username": username, "error": "Login appears to have failed"})
return login_successful
except Exception as e:
self._emit_progress("login_failed", {"username": username, "error": str(e)})
return False
def trigger_download(self, url):
"""
Open URL in new tab to trigger browser download.
Args:
url (str): URL of file to download
Returns:
bool: True if download triggered successfully
"""
if self._stop_requested:
return False
try:
# Store current window handle
current_window = self.driver.current_window_handle
# Use JavaScript to open URL in new tab with same session
self.driver.execute_script(f"window.open('{url}', '_blank');")
# Wait for download to complete and tab to auto-close
self.human_delay(3, 5)
# Switch back to original window
self.driver.switch_to.window(current_window)
self._emit_progress("download_triggered", {"url": url})
return True
except Exception as e:
self._emit_progress("download_failed", {"url": url, "error": str(e)})
return False
def scrape(self, start_page=1, end_page=1):
"""
Scrape comics from specified page range.
Args:
start_page (int): Starting page number
end_page (int): Ending page number
Returns:
dict: Summary of scraping results
"""
if self._stop_requested:
return {"success": False, "reason": "Cancelled before starting"}
# Determine base URL and URL pattern based on scraping mode
if self.scraping_mode == 1: # Latest Comics
base_url = "https://eboek.info/laatste"
mode_name = "Latest Comics"
else: # All Comics (default)
base_url = "https://eboek.info/stripverhalen-alle"
mode_name = "All Comics"
total_pages = end_page - start_page + 1
total_comics_processed = 0
total_downloads_triggered = 0
errors = []
self._emit_progress("scraping_started", {
"start_page": start_page,
"end_page": end_page,
"total_pages": total_pages,
"mode": mode_name
})
for page_num in range(start_page, end_page + 1):
if self._stop_requested:
break
# Construct page URL based on scraping mode
if self.scraping_mode == 1: # Latest Comics
page_url = f"{base_url}?_page={page_num}&ref=dw"
else: # All Comics
if page_num == 1:
page_url = base_url
else:
page_url = f"{base_url}/page/{page_num}/"
current_page_index = page_num - start_page + 1
self._emit_progress("page_started", {
"page_number": page_num,
"page_index": current_page_index,
"total_pages": total_pages,
"url": page_url
})
# Navigate to the page
if not self.navigate(page_url):
continue
# Scroll down a bit like a human would to see content
self.driver.execute_script("window.scrollTo(0, 300)")
self.human_delay(1, 2)
if self._stop_requested:
break
try:
# Find all comic strip links using mode-specific CSS selectors
if self.scraping_mode == 1: # Latest Comics page
# For "laatste" page - target only title links to avoid duplicates
comic_links = self.driver.find_elements(By.CSS_SELECTOR, '.pt-cv-wrapper .pt-cv-ifield h5.pt-cv-title a')
else: # All Comics page (default)
# For "stripverhalen-alle" page - original selector
comic_links = self.driver.find_elements(By.CSS_SELECTOR, 'h2.post-title a')
comic_count = len(comic_links)
self._emit_progress("page_comics_found", {
"page_number": page_num,
"comic_count": comic_count
})
# Store URLs first to avoid stale element issues
comic_urls = [link.get_attribute('href') for link in comic_links]
# Take a break between pages (more likely and longer)
if page_num > start_page:
if random.random() < 0.7: # 70% chance of break
break_time = random.uniform(15, 45) # 15-45 seconds
self._emit_progress("page_break_started", {
"duration": break_time,
"page_number": page_num
})
time.sleep(break_time)
else:
# Even if no long break, always pause a bit
short_break = random.uniform(5, 10)
self._emit_progress("short_break", {
"duration": short_break,
"page_number": page_num
})
time.sleep(short_break)
# Process all comics on this page
for i, url in enumerate(comic_urls, 1):
if self._stop_requested:
break
self._emit_progress("comic_started", {
"page_number": page_num,
"comic_index": i,
"total_comics": comic_count,
"url": url
})
# Random chance to scroll on main page before clicking
if random.random() < 0.4:
scroll_amount = random.randint(100, 500)
self.driver.execute_script(f"window.scrollBy(0, {scroll_amount})")
self.human_delay(0.5, 1.5)
# Open in new tab to keep main page
self.driver.execute_script("window.open('');")
self.driver.switch_to.window(self.driver.window_handles[-1])
try:
self.driver.get(url)
self.human_delay(2, 4)
if self._stop_requested:
break
# Sometimes scroll down to see the content
if random.random() < 0.6:
self.driver.execute_script("window.scrollTo(0, 400)")
self.human_delay(0.5, 1.5)
# Extract title
try:
title = self.driver.find_element(By.CSS_SELECTOR, 'h1.entry-title').text
except:
title = f"Comic {i} on page {page_num}"
self._emit_progress("comic_title_extracted", {
"title": title,
"url": url
})
# Small delay before clicking download
self.human_delay(0.8, 2)
if self._stop_requested:
break
# Execute the downloadLinks() JavaScript function
self.driver.execute_script("downloadLinks()")
self.human_delay(1.5, 3)
# Find all download links in the table
download_links = self.driver.find_elements(By.CSS_SELECTOR, 'table a')
download_count = len(download_links)
self._emit_progress("download_links_found", {
"title": title,
"download_count": download_count
})
# Trigger download for each file
for j, link in enumerate(download_links):
if self._stop_requested:
break
file_url = link.get_attribute('href')
file_name = link.text.strip()
self._emit_progress("download_started", {
"file_name": file_name,
"url": file_url,
"index": j + 1,
"total": download_count
})
if self.trigger_download(file_url):
total_downloads_triggered += 1
# Human-like delay between downloads
if j < len(download_links) - 1:
delay_time = random.uniform(2, 5)
self._emit_progress("download_delay", {
"duration": delay_time,
"remaining": len(download_links) - j - 1
})
time.sleep(delay_time)
total_comics_processed += 1
self._emit_progress("comic_completed", {
"title": title,
"downloads_triggered": download_count,
"page_number": page_num,
"comic_index": i
})
# Take a longer break every 5 comics
if i % 5 == 0 and i < len(comic_urls):
break_time = random.uniform(3, 7)
self._emit_progress("comic_batch_break", {
"duration": break_time,
"comics_processed": i
})
time.sleep(break_time)
except Exception as e:
error_msg = f"Error processing {url}: {e}"
errors.append(error_msg)
self._emit_progress("comic_error", {
"url": url,
"error": str(e)
})
# Human would pause after an error
self.human_delay(2, 4)
# Close tab and switch back
try:
self.driver.close()
self.driver.switch_to.window(self.driver.window_handles[0])
except:
# Handle case where tab might have closed itself
if len(self.driver.window_handles) > 0:
self.driver.switch_to.window(self.driver.window_handles[0])
# Vary the delay between comics
self.human_delay(1, 3)
self._emit_progress("page_completed", {
"page_number": page_num,
"comics_processed": len(comic_urls)
})
except Exception as e:
error_msg = f"Error processing page {page_num}: {e}"
errors.append(error_msg)
self._emit_progress("page_error", {
"page_number": page_num,
"error": str(e)
})
# Generate summary
summary = {
"success": not self._stop_requested,
"total_pages_processed": min(page_num - start_page + 1, total_pages) if 'page_num' in locals() else 0,
"total_comics_processed": total_comics_processed,
"total_downloads_triggered": total_downloads_triggered,
"errors": errors,
"cancelled": self._stop_requested
}
self._emit_progress("scraping_completed", summary)
return summary
def close(self):
"""Close the browser and clean up resources."""
try:
self.driver.quit()
self._emit_progress("scraper_closed", {})
except Exception as e:
self._emit_progress("scraper_close_error", {"error": str(e)})

core/scraper_thread.py (new file, 301 lines)

@@ -0,0 +1,301 @@
"""
QThread wrapper for the Scraper class with PyQt signals for GUI communication.
"""
from PyQt5.QtCore import QThread, pyqtSignal
from .scraper import Scraper
import time
class ScraperThread(QThread):
"""
Thread wrapper for the Scraper class that converts callback events to PyQt signals.
This class runs the scraper in a separate thread and emits signals that can be
connected to GUI components for real-time updates.
"""
# Login-related signals
login_started = pyqtSignal(str) # username
login_success = pyqtSignal(str) # username
login_failed = pyqtSignal(str, str) # username, error_message
# Scraping progress signals
scraping_started = pyqtSignal(int, int, int) # start_page, end_page, total_pages
scraping_completed = pyqtSignal(dict) # summary dictionary
# Page-level progress signals
page_started = pyqtSignal(int, int, int, str) # page_number, page_index, total_pages, url
page_completed = pyqtSignal(int, int) # page_number, comics_processed
page_comics_found = pyqtSignal(int, int) # page_number, comic_count
page_error = pyqtSignal(int, str) # page_number, error_message
# Comic-level progress signals
comic_started = pyqtSignal(int, int, int, str) # page_number, comic_index, total_comics, url
comic_completed = pyqtSignal(str, int, int, int) # title, downloads_triggered, page_number, comic_index
comic_title_extracted = pyqtSignal(str, str) # title, url
comic_error = pyqtSignal(str, str) # url, error_message
# Download-related signals
download_links_found = pyqtSignal(str, int) # title, download_count
download_started = pyqtSignal(str, str, int, int) # file_name, url, index, total
download_triggered = pyqtSignal(str) # url
download_failed = pyqtSignal(str, str) # url, error_message
# General status and control signals
status_update = pyqtSignal(str) # general status message
error_occurred = pyqtSignal(str) # error message
delay_started = pyqtSignal(float) # duration
stop_requested = pyqtSignal()
# Navigation signals
navigation_started = pyqtSignal(str) # url
navigation_completed = pyqtSignal(str) # url
# Break and timing signals
page_break_started = pyqtSignal(float, int) # duration, page_number
short_break = pyqtSignal(float, int) # duration, page_number
comic_batch_break = pyqtSignal(float, int) # duration, comics_processed
download_delay = pyqtSignal(float, int) # duration, remaining_downloads
def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True):
"""
Initialize the scraper thread.
Args:
username (str): EBoek.info username
password (str): EBoek.info password
start_page (int): Starting page number
end_page (int): Ending page number
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
headless (bool): Whether to run Chrome in headless mode
"""
super().__init__()
self.username = username
self.password = password
self.start_page = start_page
self.end_page = end_page
self.scraping_mode = scraping_mode
self.headless = headless
self.scraper = None
self._is_running = False
def run(self):
"""
Main thread execution method.
This runs in the separate thread and should not be called directly.
"""
try:
self._is_running = True
# Initialize scraper with progress callback
self.scraper = Scraper(
headless=self.headless,
progress_callback=self._handle_scraper_progress,
scraping_mode=self.scraping_mode
)
# Perform login
self.login_started.emit(self.username)
login_success = self.scraper.login(self.username, self.password)
if not login_success:
self.login_failed.emit(self.username, "Login failed. Please check your credentials.")
return
# Check if stop was requested during login
if self.scraper._stop_requested:
return
# Start scraping
summary = self.scraper.scrape(self.start_page, self.end_page)
# Emit completion signal
self.scraping_completed.emit(summary)
except Exception as e:
self.error_occurred.emit(f"Unexpected error: {str(e)}")
finally:
# Clean up
if self.scraper:
self.scraper.close()
self._is_running = False
def _handle_scraper_progress(self, event_type, data):
"""
Handle progress callbacks from the Scraper and convert them to PyQt signals.
Args:
event_type (str): Type of event from the scraper
data (dict): Event data
"""
try:
# Login events
if event_type == "login_started":
# Already handled in run() method
pass
elif event_type == "login_success":
self.login_success.emit(data.get("username", ""))
elif event_type == "login_failed":
self.login_failed.emit(data.get("username", ""), data.get("error", "Unknown error"))
# Scraping events
elif event_type == "scraping_started":
self.scraping_started.emit(
data.get("start_page", 1),
data.get("end_page", 1),
data.get("total_pages", 1)
)
elif event_type == "scraping_completed":
self.scraping_completed.emit(data)
# Page events
elif event_type == "page_started":
self.page_started.emit(
data.get("page_number", 1),
data.get("page_index", 1),
data.get("total_pages", 1),
data.get("url", "")
)
elif event_type == "page_completed":
self.page_completed.emit(
data.get("page_number", 1),
data.get("comics_processed", 0)
)
elif event_type == "page_comics_found":
self.page_comics_found.emit(
data.get("page_number", 1),
data.get("comic_count", 0)
)
elif event_type == "page_error":
self.page_error.emit(
data.get("page_number", 1),
data.get("error", "Unknown error")
)
# Comic events
elif event_type == "comic_started":
self.comic_started.emit(
data.get("page_number", 1),
data.get("comic_index", 1),
data.get("total_comics", 1),
data.get("url", "")
)
elif event_type == "comic_completed":
self.comic_completed.emit(
data.get("title", "Unknown"),
data.get("downloads_triggered", 0),
data.get("page_number", 1),
data.get("comic_index", 1)
)
elif event_type == "comic_title_extracted":
self.comic_title_extracted.emit(
data.get("title", "Unknown"),
data.get("url", "")
)
elif event_type == "comic_error":
self.comic_error.emit(
data.get("url", ""),
data.get("error", "Unknown error")
)
# Download events
elif event_type == "download_links_found":
self.download_links_found.emit(
data.get("title", "Unknown"),
data.get("download_count", 0)
)
elif event_type == "download_started":
self.download_started.emit(
data.get("file_name", ""),
data.get("url", ""),
data.get("index", 1),
data.get("total", 1)
)
elif event_type == "download_triggered":
self.download_triggered.emit(data.get("url", ""))
elif event_type == "download_failed":
self.download_failed.emit(
data.get("url", ""),
data.get("error", "Unknown error")
)
# Navigation events
elif event_type == "navigation_started":
self.navigation_started.emit(data.get("url", ""))
elif event_type == "navigation_completed":
self.navigation_completed.emit(data.get("url", ""))
# Timing and break events
elif event_type == "delay_started":
self.delay_started.emit(data.get("duration", 0.0))
elif event_type == "page_break_started":
self.page_break_started.emit(
data.get("duration", 0.0),
data.get("page_number", 1)
)
elif event_type == "short_break":
self.short_break.emit(
data.get("duration", 0.0),
data.get("page_number", 1)
)
elif event_type == "comic_batch_break":
self.comic_batch_break.emit(
data.get("duration", 0.0),
data.get("comics_processed", 0)
)
elif event_type == "download_delay":
self.download_delay.emit(
data.get("duration", 0.0),
data.get("remaining", 0)
)
# Control events
elif event_type == "stop_requested":
self.stop_requested.emit()
# General status updates
elif event_type in ["scraper_initialized", "scraper_closed", "scraper_close_error"]:
self.status_update.emit(f"{event_type}: {data}")
# Emit a general status update for events we didn't specifically handle
else:
self.status_update.emit(f"{event_type}: {data}")
except Exception as e:
# Don't let signal emission errors crash the scraper
self.error_occurred.emit(f"Signal emission error: {str(e)}")
def request_stop(self):
"""
Request the scraper to stop gracefully.
This can be called from the main thread (GUI).
"""
if self.scraper:
self.scraper.request_stop()
def is_running(self):
"""
Check if the scraper thread is currently running.
Returns:
bool: True if the thread is running
"""
return self._is_running and self.isRunning()
def get_progress_summary(self):
"""
Get a summary of the current progress.
This is thread-safe and can be called from the main thread.
Returns:
dict: Current progress information
"""
if not self.scraper:
return {"status": "not_started"}
return {
"status": "running" if self._is_running else "stopped",
"stop_requested": self.scraper._stop_requested if self.scraper else False,
"thread_running": self.isRunning()
}
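Finally, a hedged sketch of wiring ScraperThread signals into a Qt event loop. This harness is not part of the commit; the credentials and page range are placeholders:

import sys
from PyQt5.QtWidgets import QApplication
from core.scraper_thread import ScraperThread

app = QApplication(sys.argv)
thread = ScraperThread("demo_user", "demo_pass", start_page=1, end_page=1,
                       scraping_mode=1, headless=True)

# A real GUI would update progress bars and labels in these slots.
thread.login_success.connect(lambda user: print(f"Logged in as {user}"))
thread.page_started.connect(lambda num, idx, total, url: print(f"Page {idx}/{total}: {url}"))
thread.error_occurred.connect(print)
thread.scraping_completed.connect(lambda summary: (print(summary), app.quit()))

thread.start()
sys.exit(app.exec_())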