eboek.info-scraper/core/scraper_thread.py
Louis Mylle ea4cab15c3 feat: Add installation scripts for Windows and Unix-based systems
- Created `install_and_run.bat` for Windows installation and setup.
- Created `install_and_run.sh` for Unix-based systems installation and setup.
- Removed `main.py` as it is no longer needed.
- Updated `requirements.txt` to specify package versions and added PyQt5.
- Deleted `start.bat` as it is redundant.
- Added unit tests for core functionality and scraping modes.
- Implemented input validation utilities in `utils/validators.py`.
- Added support for dual scraping modes in the scraper.
2026-01-10 14:45:00 +01:00

301 lines
11 KiB
Python

"""
QThread wrapper for the Scraper class with PyQt signals for GUI communication.
"""
from PyQt5.QtCore import QThread, pyqtSignal
from .scraper import Scraper
import time
class ScraperThread(QThread):
    """
    Thread wrapper for the Scraper class that converts callback events to PyQt signals.

    This class runs the scraper in a separate thread and emits signals that can be
    connected to GUI components for real-time updates. The scraper reports progress
    through a callback (``_handle_scraper_progress``); each callback event type is
    translated into the matching typed signal below.
    """

    # Login-related signals
    login_started = pyqtSignal(str)  # username
    login_success = pyqtSignal(str)  # username
    login_failed = pyqtSignal(str, str)  # username, error_message

    # Scraping progress signals
    scraping_started = pyqtSignal(int, int, int)  # start_page, end_page, total_pages
    scraping_completed = pyqtSignal(dict)  # summary dictionary

    # Page-level progress signals
    page_started = pyqtSignal(int, int, int, str)  # page_number, page_index, total_pages, url
    page_completed = pyqtSignal(int, int)  # page_number, comics_processed
    page_comics_found = pyqtSignal(int, int)  # page_number, comic_count
    page_error = pyqtSignal(int, str)  # page_number, error_message

    # Comic-level progress signals
    comic_started = pyqtSignal(int, int, int, str)  # page_number, comic_index, total_comics, url
    comic_completed = pyqtSignal(str, int, int, int)  # title, downloads_triggered, page_number, comic_index
    comic_title_extracted = pyqtSignal(str, str)  # title, url
    comic_error = pyqtSignal(str, str)  # url, error_message

    # Download-related signals
    download_links_found = pyqtSignal(str, int)  # title, download_count
    download_started = pyqtSignal(str, str, int, int)  # file_name, url, index, total
    download_triggered = pyqtSignal(str)  # url
    download_failed = pyqtSignal(str, str)  # url, error_message

    # General status and control signals
    status_update = pyqtSignal(str)  # general status message
    error_occurred = pyqtSignal(str)  # error message
    delay_started = pyqtSignal(float)  # duration
    stop_requested = pyqtSignal()

    # Navigation signals
    navigation_started = pyqtSignal(str)  # url
    navigation_completed = pyqtSignal(str)  # url

    # Break and timing signals
    page_break_started = pyqtSignal(float, int)  # duration, page_number
    short_break = pyqtSignal(float, int)  # duration, page_number
    comic_batch_break = pyqtSignal(float, int)  # duration, comics_processed
    download_delay = pyqtSignal(float, int)  # duration, remaining_downloads

    # Declarative dispatch: scraper event type -> (signal attribute name,
    # ordered (payload key, default) pairs). _handle_scraper_progress reads the
    # payload keys with these defaults and emits the named signal positionally.
    # Events with special handling (login_started, scraping_completed,
    # stop_requested) are handled explicitly and are NOT listed here.
    _EVENT_SIGNAL_MAP = {
        "login_success": ("login_success", (("username", ""),)),
        "login_failed": ("login_failed", (("username", ""), ("error", "Unknown error"))),
        "scraping_started": ("scraping_started", (("start_page", 1), ("end_page", 1), ("total_pages", 1))),
        "page_started": ("page_started", (("page_number", 1), ("page_index", 1), ("total_pages", 1), ("url", ""))),
        "page_completed": ("page_completed", (("page_number", 1), ("comics_processed", 0))),
        "page_comics_found": ("page_comics_found", (("page_number", 1), ("comic_count", 0))),
        "page_error": ("page_error", (("page_number", 1), ("error", "Unknown error"))),
        "comic_started": ("comic_started", (("page_number", 1), ("comic_index", 1), ("total_comics", 1), ("url", ""))),
        "comic_completed": ("comic_completed", (("title", "Unknown"), ("downloads_triggered", 0), ("page_number", 1), ("comic_index", 1))),
        "comic_title_extracted": ("comic_title_extracted", (("title", "Unknown"), ("url", ""))),
        "comic_error": ("comic_error", (("url", ""), ("error", "Unknown error"))),
        "download_links_found": ("download_links_found", (("title", "Unknown"), ("download_count", 0))),
        "download_started": ("download_started", (("file_name", ""), ("url", ""), ("index", 1), ("total", 1))),
        "download_triggered": ("download_triggered", (("url", ""),)),
        "download_failed": ("download_failed", (("url", ""), ("error", "Unknown error"))),
        "navigation_started": ("navigation_started", (("url", ""),)),
        "navigation_completed": ("navigation_completed", (("url", ""),)),
        "delay_started": ("delay_started", (("duration", 0.0),)),
        "page_break_started": ("page_break_started", (("duration", 0.0), ("page_number", 1))),
        "short_break": ("short_break", (("duration", 0.0), ("page_number", 1))),
        "comic_batch_break": ("comic_batch_break", (("duration", 0.0), ("comics_processed", 0))),
        "download_delay": ("download_delay", (("duration", 0.0), ("remaining", 0))),
    }

    def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True):
        """
        Initialize the scraper thread.

        Args:
            username (str): EBoek.info username
            password (str): EBoek.info password
            start_page (int): Starting page number
            end_page (int): Ending page number
            scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
            headless (bool): Whether to run Chrome in headless mode
        """
        super().__init__()
        self.username = username
        self.password = password
        self.start_page = start_page
        self.end_page = end_page
        self.scraping_mode = scraping_mode
        self.headless = headless
        self.scraper = None  # created lazily in run(), on the worker thread
        self._is_running = False

    def run(self):
        """
        Main thread execution method.

        This runs in the separate thread and should not be called directly;
        call start() on the QThread instead. Emits login/scraping signals as
        the run progresses and always closes the scraper on exit.
        """
        try:
            self._is_running = True
            # Initialize scraper with progress callback so scraper events are
            # forwarded to the GUI as signals.
            self.scraper = Scraper(
                headless=self.headless,
                progress_callback=self._handle_scraper_progress,
                scraping_mode=self.scraping_mode
            )
            # Perform login. Note: emitted here (not via the callback) so the
            # GUI sees it even before the scraper reports anything.
            self.login_started.emit(self.username)
            logged_in = self.scraper.login(self.username, self.password)
            if not logged_in:
                self.login_failed.emit(self.username, "Login failed. Please check your credentials.")
                return
            # Check if stop was requested during login.
            # NOTE(review): reaches into the Scraper's private _stop_requested
            # flag — a public accessor on Scraper would be cleaner; confirm none exists.
            if self.scraper._stop_requested:
                return
            # Start scraping and report the summary dict to the GUI.
            summary = self.scraper.scrape(self.start_page, self.end_page)
            self.scraping_completed.emit(summary)
        except Exception as e:
            self.error_occurred.emit(f"Unexpected error: {str(e)}")
        finally:
            # Clean up the browser/session regardless of how the run ended.
            if self.scraper:
                self.scraper.close()
            self._is_running = False

    def _handle_scraper_progress(self, event_type, data):
        """
        Handle progress callbacks from the Scraper and convert them to PyQt signals.

        Most events are translated via _EVENT_SIGNAL_MAP; the remaining special
        cases are handled explicitly. Unknown event types fall back to a generic
        status_update message.

        Args:
            event_type (str): Type of event from the scraper
            data (dict): Event data
        """
        try:
            if event_type == "login_started":
                # Already emitted in run() before login is attempted; ignore.
                return
            if event_type == "scraping_completed":
                # The whole summary dict is forwarded as-is.
                self.scraping_completed.emit(data)
            elif event_type == "stop_requested":
                self.stop_requested.emit()
            elif event_type in self._EVENT_SIGNAL_MAP:
                signal_name, fields = self._EVENT_SIGNAL_MAP[event_type]
                args = [data.get(key, default) for key, default in fields]
                getattr(self, signal_name).emit(*args)
            else:
                # Events with no dedicated signal (e.g. scraper_initialized,
                # scraper_closed, scraper_close_error) become generic status text.
                self.status_update.emit(f"{event_type}: {data}")
        except Exception as e:
            # Don't let signal emission errors crash the scraper thread.
            self.error_occurred.emit(f"Signal emission error: {str(e)}")

    def request_stop(self):
        """
        Request the scraper to stop gracefully.

        This can be called from the main thread (GUI). It only forwards the
        request; the scraper decides when it is safe to stop.
        """
        if self.scraper:
            self.scraper.request_stop()

    def is_running(self):
        """
        Check if the scraper thread is currently running.

        Returns:
            bool: True if run() is active AND the underlying QThread reports running
        """
        return self._is_running and self.isRunning()

    def get_progress_summary(self):
        """
        Get a summary of the current progress.

        This is thread-safe and can be called from the main thread.

        Returns:
            dict: Current progress information; {"status": "not_started"} before
            run() has created the scraper.
        """
        if not self.scraper:
            return {"status": "not_started"}
        return {
            "status": "running" if self._is_running else "stopped",
            # NOTE(review): private attribute access on Scraper — see run().
            "stop_requested": self.scraper._stop_requested if self.scraper else False,
            "thread_running": self.isRunning()
        }