feat: Enhance scraper timing configuration with user-defined settings and UI adjustments

This commit is contained in:
Louis Mylle
2026-01-10 17:36:09 +01:00
parent 3248060317
commit 610a20d12d
5 changed files with 422 additions and 67 deletions

View File

@@ -257,7 +257,17 @@ class CredentialManager:
'download_path': str(Path.home() / "Downloads"),
'default_start_page': 1,
'default_end_page': 1,
'scraping_mode': 0 # 0=All Comics, 1=Latest Comics
'scraping_mode': 0, # 0=All Comics, 1=Latest Comics
# Timing configuration defaults
'action_delay_min': 0.5, # Minimum delay between actions (seconds)
'action_delay_max': 2.0, # Maximum delay between actions (seconds)
'page_break_chance': 70, # Percentage chance of taking a break between pages
'page_break_min': 15, # Minimum page break duration (seconds)
'page_break_max': 45, # Maximum page break duration (seconds)
'batch_break_interval': 5, # Take a break every N comics
'batch_break_min': 3, # Minimum batch break duration (seconds)
'batch_break_max': 7, # Maximum batch break duration (seconds)
'typing_delay': 0.1, # Delay between character typing (seconds)
}
def export_settings(self, export_path):

View File

@@ -25,7 +25,7 @@ class Scraper:
callback mechanisms for progress updates to a GUI application.
"""
def __init__(self, headless=False, progress_callback=None, scraping_mode=0):
def __init__(self, headless=False, progress_callback=None, scraping_mode=0, timing_config=None):
"""
Initialize the scraper with optional GUI callback support.
@@ -34,11 +34,16 @@ class Scraper:
progress_callback (callable): Optional callback function for progress updates
Callback signature: callback(event_type: str, data: dict)
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
timing_config (dict): Timing configuration settings
"""
self.progress_callback = progress_callback
self._stop_requested = False
self.scraping_mode = scraping_mode
# Set up timing configuration with defaults
self.timing = timing_config or {}
self._setup_timing_defaults()
# Set up Chrome options with anti-detection measures
chrome_options = Options()
if headless:
@@ -103,16 +108,42 @@ class Scraper:
self._stop_requested = True
self._emit_progress("stop_requested", {})
def human_delay(self, min_sec=0.5, max_sec=2):
def _setup_timing_defaults(self):
"""Set up timing configuration with default values."""
defaults = {
'action_delay_min': 0.5,
'action_delay_max': 2.0,
'page_break_chance': 70,
'page_break_min': 15,
'page_break_max': 45,
'batch_break_interval': 5,
'batch_break_min': 3,
'batch_break_max': 7,
'typing_delay': 0.1,
}
# Fill in any missing values with defaults
for key, default_value in defaults.items():
if key not in self.timing:
self.timing[key] = default_value
def human_delay(self, min_sec=None, max_sec=None):
    """
    Simulate human-like delay with cancellation support.

    Args:
        min_sec (float): Minimum delay time (uses the configured
            'action_delay_min' default if None)
        max_sec (float): Maximum delay time (uses the configured
            'action_delay_max' default if None)
    """
    # Bail out before sleeping so a stop request is honored immediately
    # instead of being delayed by a pending sleep.
    if self._stop_requested:
        return
    # Use configured timing if no specific values provided
    if min_sec is None:
        min_sec = self.timing['action_delay_min']
    if max_sec is None:
        max_sec = self.timing['action_delay_max']
    delay_time = random.uniform(min_sec, max_sec)
    # Announce the chosen delay so a GUI callback can display it.
    self._emit_progress("delay_started", {"duration": delay_time})
    time.sleep(delay_time)
@@ -129,7 +160,8 @@ class Scraper:
if self._stop_requested:
return
element.send_keys(char)
time.sleep(random.uniform(0.05, 0.15))
typing_delay = self.timing['typing_delay']
time.sleep(random.uniform(typing_delay * 0.5, typing_delay * 1.5))
def navigate(self, url):
"""
@@ -330,8 +362,9 @@ class Scraper:
# Take a break between pages (more likely and longer)
if page_num > start_page:
if random.random() < 0.7: # 70% chance of break
break_time = random.uniform(15, 45) # 15-45 seconds
break_chance = self.timing['page_break_chance'] / 100.0
if random.random() < break_chance:
break_time = random.uniform(self.timing['page_break_min'], self.timing['page_break_max'])
self._emit_progress("page_break_started", {
"duration": break_time,
"page_number": page_num
@@ -446,9 +479,10 @@ class Scraper:
"comic_index": i
})
# Take a longer break every 5 comics
if i % 5 == 0 and i < len(comic_urls):
break_time = random.uniform(3, 7)
# Take a longer break every N comics (configurable)
batch_interval = self.timing['batch_break_interval']
if i % batch_interval == 0 and i < len(comic_urls):
break_time = random.uniform(self.timing['batch_break_min'], self.timing['batch_break_max'])
self._emit_progress("comic_batch_break", {
"duration": break_time,
"comics_processed": i

View File

@@ -58,7 +58,7 @@ class ScraperThread(QThread):
comic_batch_break = pyqtSignal(float, int) # duration, comics_processed
download_delay = pyqtSignal(float, int) # duration, remaining_downloads
def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True):
def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True, timing_config=None):
"""
Initialize the scraper thread.
@@ -69,6 +69,7 @@ class ScraperThread(QThread):
end_page (int): Ending page number
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
headless (bool): Whether to run Chrome in headless mode
timing_config (dict): Timing configuration settings
"""
super().__init__()
self.username = username
@@ -77,6 +78,7 @@ class ScraperThread(QThread):
self.end_page = end_page
self.scraping_mode = scraping_mode
self.headless = headless
self.timing_config = timing_config
self.scraper = None
self._is_running = False
@@ -92,7 +94,8 @@ class ScraperThread(QThread):
self.scraper = Scraper(
headless=self.headless,
progress_callback=self._handle_scraper_progress,
scraping_mode=self.scraping_mode
scraping_mode=self.scraping_mode,
timing_config=self.timing_config
)
# Perform login