feat: Enhance scraper timing configuration with user-defined settings and UI adjustments

This commit is contained in:
Louis Mylle
2026-01-10 17:36:09 +01:00
parent 3248060317
commit 610a20d12d
5 changed files with 422 additions and 67 deletions

View File

@@ -25,7 +25,7 @@ class Scraper:
callback mechanisms for progress updates to a GUI application.
"""
def __init__(self, headless=False, progress_callback=None, scraping_mode=0):
def __init__(self, headless=False, progress_callback=None, scraping_mode=0, timing_config=None):
"""
Initialize the scraper with optional GUI callback support.
@@ -34,11 +34,16 @@ class Scraper:
progress_callback (callable): Optional callback function for progress updates
Callback signature: callback(event_type: str, data: dict)
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
timing_config (dict): Timing configuration settings
"""
self.progress_callback = progress_callback
self._stop_requested = False
self.scraping_mode = scraping_mode
# Set up timing configuration with defaults
self.timing = timing_config or {}
self._setup_timing_defaults()
# Set up Chrome options with anti-detection measures
chrome_options = Options()
if headless:
@@ -103,16 +108,42 @@ class Scraper:
self._stop_requested = True
self._emit_progress("stop_requested", {})
def human_delay(self, min_sec=0.5, max_sec=2):
def _setup_timing_defaults(self):
    """Populate ``self.timing`` with a default for every missing setting.

    Keys already present in ``self.timing`` are left untouched, whatever
    their value; only absent keys receive the fallback listed below.
    The dictionary is updated in place.
    """
    fallback_settings = {
        # Bounds used by human_delay() when called without arguments.
        'action_delay_min': 0.5,
        'action_delay_max': 2.0,
        # Page break: chance is a percentage, min/max bound the duration.
        'page_break_chance': 70,
        'page_break_min': 15,
        'page_break_max': 45,
        # Batch break: taken every `interval` comics, min/max bound it.
        'batch_break_interval': 5,
        'batch_break_min': 3,
        'batch_break_max': 7,
        # Base per-character delay used while typing.
        'typing_delay': 0.1,
    }
    # setdefault() writes a value only when the key is absent, so any
    # caller-supplied setting always wins over the fallback.
    for setting_name, fallback in fallback_settings.items():
        self.timing.setdefault(setting_name, fallback)
def human_delay(self, min_sec=None, max_sec=None):
"""
Simulate human-like delay with cancellation support.
Args:
min_sec (float): Minimum delay time
max_sec (float): Maximum delay time
min_sec (float): Minimum delay time (uses config default if None)
max_sec (float): Maximum delay time (uses config default if None)
"""
if self._stop_requested:
return
# Use configured timing if no specific values provided
if min_sec is None:
min_sec = self.timing['action_delay_min']
if max_sec is None:
max_sec = self.timing['action_delay_max']
delay_time = random.uniform(min_sec, max_sec)
self._emit_progress("delay_started", {"duration": delay_time})
time.sleep(delay_time)
@@ -129,7 +160,8 @@ class Scraper:
if self._stop_requested:
return
element.send_keys(char)
time.sleep(random.uniform(0.05, 0.15))
typing_delay = self.timing['typing_delay']
time.sleep(random.uniform(typing_delay * 0.5, typing_delay * 1.5))
def navigate(self, url):
"""
@@ -330,8 +362,9 @@ class Scraper:
# Take a break between pages (more likely and longer)
if page_num > start_page:
if random.random() < 0.7: # 70% chance of break
break_time = random.uniform(15, 45) # 15-45 seconds
break_chance = self.timing['page_break_chance'] / 100.0
if random.random() < break_chance:
break_time = random.uniform(self.timing['page_break_min'], self.timing['page_break_max'])
self._emit_progress("page_break_started", {
"duration": break_time,
"page_number": page_num
@@ -446,9 +479,10 @@ class Scraper:
"comic_index": i
})
# Take a longer break every 5 comics
if i % 5 == 0 and i < len(comic_urls):
break_time = random.uniform(3, 7)
# Take a longer break every N comics (configurable)
batch_interval = self.timing['batch_break_interval']
if i % batch_interval == 0 and i < len(comic_urls):
break_time = random.uniform(self.timing['batch_break_min'], self.timing['batch_break_max'])
self._emit_progress("comic_batch_break", {
"duration": break_time,
"comics_processed": i