feat: Add installation scripts for Windows and Unix-based systems
- Created `install_and_run.bat` for Windows installation and setup.
- Created `install_and_run.sh` for Unix-based systems installation and setup.
- Removed `main.py` as it is no longer needed.
- Updated `requirements.txt` to specify package versions and added PyQt5.
- Deleted `start.bat` as it is redundant.
- Added unit tests for core functionality and scraping modes.
- Implemented input validation utilities in `utils/validators.py`.
- Added support for dual scraping modes in the scraper.
This commit is contained in:
301
core/scraper_thread.py
Normal file
301
core/scraper_thread.py
Normal file
@@ -0,0 +1,301 @@
|
||||
"""
|
||||
QThread wrapper for the Scraper class with PyQt signals for GUI communication.
|
||||
"""
|
||||
|
||||
from PyQt5.QtCore import QThread, pyqtSignal
|
||||
from .scraper import Scraper
|
||||
import time
|
||||
|
||||
|
||||
class ScraperThread(QThread):
    """
    Thread wrapper for the Scraper class that converts callback events to PyQt signals.

    This class runs the scraper in a separate thread and emits signals that can be
    connected to GUI components for real-time updates.
    """

    # Login-related signals
    login_started = pyqtSignal(str)  # username
    login_success = pyqtSignal(str)  # username
    login_failed = pyqtSignal(str, str)  # username, error_message

    # Scraping progress signals
    scraping_started = pyqtSignal(int, int, int)  # start_page, end_page, total_pages
    scraping_completed = pyqtSignal(dict)  # summary dictionary

    # Page-level progress signals
    page_started = pyqtSignal(int, int, int, str)  # page_number, page_index, total_pages, url
    page_completed = pyqtSignal(int, int)  # page_number, comics_processed
    page_comics_found = pyqtSignal(int, int)  # page_number, comic_count
    page_error = pyqtSignal(int, str)  # page_number, error_message

    # Comic-level progress signals
    comic_started = pyqtSignal(int, int, int, str)  # page_number, comic_index, total_comics, url
    comic_completed = pyqtSignal(str, int, int, int)  # title, downloads_triggered, page_number, comic_index
    comic_title_extracted = pyqtSignal(str, str)  # title, url
    comic_error = pyqtSignal(str, str)  # url, error_message

    # Download-related signals
    download_links_found = pyqtSignal(str, int)  # title, download_count
    download_started = pyqtSignal(str, str, int, int)  # file_name, url, index, total
    download_triggered = pyqtSignal(str)  # url
    download_failed = pyqtSignal(str, str)  # url, error_message

    # General status and control signals
    status_update = pyqtSignal(str)  # general status message
    error_occurred = pyqtSignal(str)  # error message
    delay_started = pyqtSignal(float)  # duration
    stop_requested = pyqtSignal()

    # Navigation signals
    navigation_started = pyqtSignal(str)  # url
    navigation_completed = pyqtSignal(str)  # url

    # Break and timing signals
    page_break_started = pyqtSignal(float, int)  # duration, page_number
    short_break = pyqtSignal(float, int)  # duration, page_number
    comic_batch_break = pyqtSignal(float, int)  # duration, comics_processed
    download_delay = pyqtSignal(float, int)  # duration, remaining_downloads

    # Dispatch table for _handle_scraper_progress. Every key is an event type
    # whose name is ALSO the name of the signal to emit; the value lists the
    # (data key, default) pairs pulled from the event's data dict, in
    # emit-argument order. Events not listed here (including the informational
    # "scraper_initialized" / "scraper_closed" / "scraper_close_error" events)
    # fall back to a generic status_update message.
    _EVENT_ARGS = {
        "login_success": (("username", ""),),
        "login_failed": (("username", ""), ("error", "Unknown error")),
        "scraping_started": (("start_page", 1), ("end_page", 1), ("total_pages", 1)),
        "page_started": (("page_number", 1), ("page_index", 1), ("total_pages", 1), ("url", "")),
        "page_completed": (("page_number", 1), ("comics_processed", 0)),
        "page_comics_found": (("page_number", 1), ("comic_count", 0)),
        "page_error": (("page_number", 1), ("error", "Unknown error")),
        "comic_started": (("page_number", 1), ("comic_index", 1), ("total_comics", 1), ("url", "")),
        "comic_completed": (("title", "Unknown"), ("downloads_triggered", 0), ("page_number", 1), ("comic_index", 1)),
        "comic_title_extracted": (("title", "Unknown"), ("url", "")),
        "comic_error": (("url", ""), ("error", "Unknown error")),
        "download_links_found": (("title", "Unknown"), ("download_count", 0)),
        "download_started": (("file_name", ""), ("url", ""), ("index", 1), ("total", 1)),
        "download_triggered": (("url", ""),),
        "download_failed": (("url", ""), ("error", "Unknown error")),
        "navigation_started": (("url", ""),),
        "navigation_completed": (("url", ""),),
        "delay_started": (("duration", 0.0),),
        "page_break_started": (("duration", 0.0), ("page_number", 1)),
        "short_break": (("duration", 0.0), ("page_number", 1)),
        "comic_batch_break": (("duration", 0.0), ("comics_processed", 0)),
        "download_delay": (("duration", 0.0), ("remaining", 0)),
    }

    def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True):
        """
        Initialize the scraper thread.

        Args:
            username (str): EBoek.info username
            password (str): EBoek.info password
            start_page (int): Starting page number
            end_page (int): Ending page number
            scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
            headless (bool): Whether to run Chrome in headless mode
        """
        super().__init__()
        self.username = username
        self.password = password
        self.start_page = start_page
        self.end_page = end_page
        self.scraping_mode = scraping_mode
        self.headless = headless
        self.scraper = None  # created lazily in run(), on the worker thread
        self._is_running = False

    def run(self):
        """
        Main thread execution method.

        This runs in the separate thread and should not be called directly;
        use QThread.start() instead. Emits login/scraping signals as the
        underlying Scraper reports progress, and scraping_completed with the
        summary dict on success.
        """
        try:
            self._is_running = True

            # Initialize scraper with progress callback so every scraper event
            # is translated into a PyQt signal (see _handle_scraper_progress).
            self.scraper = Scraper(
                headless=self.headless,
                progress_callback=self._handle_scraper_progress,
                scraping_mode=self.scraping_mode
            )

            # Perform login
            self.login_started.emit(self.username)
            login_success = self.scraper.login(self.username, self.password)

            if not login_success:
                self.login_failed.emit(self.username, "Login failed. Please check your credentials.")
                return

            # Check if stop was requested during login.
            # NOTE(review): reaches into Scraper's private _stop_requested flag;
            # consider exposing a public accessor on Scraper.
            if self.scraper._stop_requested:
                return

            # Start scraping
            summary = self.scraper.scrape(self.start_page, self.end_page)

            # Emit completion signal
            self.scraping_completed.emit(summary)

        except Exception as e:
            self.error_occurred.emit(f"Unexpected error: {str(e)}")
        finally:
            # Clear the running flag first so is_running() is accurate even if
            # cleanup below raises.
            self._is_running = False
            if self.scraper:
                try:
                    self.scraper.close()
                except Exception as e:
                    # Never let cleanup errors escape the thread entry point.
                    self.error_occurred.emit(f"Error closing scraper: {str(e)}")

    def _handle_scraper_progress(self, event_type, data):
        """
        Handle progress callbacks from the Scraper and convert them to PyQt signals.

        Dispatch is table-driven via _EVENT_ARGS: for a known event, the signal
        with the same name is emitted with arguments extracted from *data*
        (missing keys fall back to per-argument defaults). A few events need
        special handling and are dealt with inline.

        Args:
            event_type (str): Type of event from the scraper
            data (dict): Event data
        """
        try:
            if event_type == "login_started":
                # Already emitted in run() before login is attempted.
                return
            if event_type == "scraping_completed":
                # Forwards the whole summary dict unmodified.
                self.scraping_completed.emit(data)
            elif event_type == "stop_requested":
                self.stop_requested.emit()
            else:
                spec = self._EVENT_ARGS.get(event_type)
                if spec is None:
                    # Unknown or informational events become a generic status line.
                    self.status_update.emit(f"{event_type}: {data}")
                else:
                    signal = getattr(self, event_type)
                    signal.emit(*(data.get(key, default) for key, default in spec))
        except Exception as e:
            # Don't let signal emission errors crash the scraper.
            self.error_occurred.emit(f"Signal emission error: {str(e)}")

    def request_stop(self):
        """
        Request the scraper to stop gracefully.

        This can be called from the main thread (GUI). No-op if the scraper
        has not been created yet.
        """
        if self.scraper:
            self.scraper.request_stop()

    def is_running(self):
        """
        Check if the scraper thread is currently running.

        Returns:
            bool: True if both the scraper work and the QThread are active
        """
        return self._is_running and self.isRunning()

    def get_progress_summary(self):
        """
        Get a summary of the current progress.

        This is thread-safe and can be called from the main thread.

        Returns:
            dict: Current progress information
        """
        if not self.scraper:
            return {"status": "not_started"}

        return {
            "status": "running" if self._is_running else "stopped",
            # NOTE(review): private-attribute access into Scraper; the
            # "if self.scraper" guard is redundant after the early return
            # above but kept for defensiveness.
            "stop_requested": self.scraper._stop_requested if self.scraper else False,
            "thread_running": self.isRunning()
        }
|
||||
Reference in New Issue
Block a user