feat: Enhance scraper timing configuration with user-defined settings and UI adjustments
This commit is contained in:
@@ -25,7 +25,7 @@ class Scraper:
|
||||
callback mechanisms for progress updates to a GUI application.
|
||||
"""
|
||||
|
||||
def __init__(self, headless=False, progress_callback=None, scraping_mode=0):
|
||||
def __init__(self, headless=False, progress_callback=None, scraping_mode=0, timing_config=None):
|
||||
"""
|
||||
Initialize the scraper with optional GUI callback support.
|
||||
|
||||
@@ -34,11 +34,16 @@ class Scraper:
|
||||
progress_callback (callable): Optional callback function for progress updates
|
||||
Callback signature: callback(event_type: str, data: dict)
|
||||
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
|
||||
timing_config (dict): Timing configuration settings
|
||||
"""
|
||||
self.progress_callback = progress_callback
|
||||
self._stop_requested = False
|
||||
self.scraping_mode = scraping_mode
|
||||
|
||||
# Set up timing configuration with defaults
|
||||
self.timing = timing_config or {}
|
||||
self._setup_timing_defaults()
|
||||
|
||||
# Set up Chrome options with anti-detection measures
|
||||
chrome_options = Options()
|
||||
if headless:
|
||||
@@ -103,16 +108,42 @@ class Scraper:
|
||||
self._stop_requested = True
|
||||
self._emit_progress("stop_requested", {})
|
||||
|
||||
def human_delay(self, min_sec=0.5, max_sec=2):
|
||||
def _setup_timing_defaults(self):
|
||||
"""Set up timing configuration with default values."""
|
||||
defaults = {
|
||||
'action_delay_min': 0.5,
|
||||
'action_delay_max': 2.0,
|
||||
'page_break_chance': 70,
|
||||
'page_break_min': 15,
|
||||
'page_break_max': 45,
|
||||
'batch_break_interval': 5,
|
||||
'batch_break_min': 3,
|
||||
'batch_break_max': 7,
|
||||
'typing_delay': 0.1,
|
||||
}
|
||||
|
||||
# Fill in any missing values with defaults
|
||||
for key, default_value in defaults.items():
|
||||
if key not in self.timing:
|
||||
self.timing[key] = default_value
|
||||
|
||||
def human_delay(self, min_sec=None, max_sec=None):
|
||||
"""
|
||||
Simulate human-like delay with cancellation support.
|
||||
|
||||
Args:
|
||||
min_sec (float): Minimum delay time
|
||||
max_sec (float): Maximum delay time
|
||||
min_sec (float): Minimum delay time (uses config default if None)
|
||||
max_sec (float): Maximum delay time (uses config default if None)
|
||||
"""
|
||||
if self._stop_requested:
|
||||
return
|
||||
|
||||
# Use configured timing if no specific values provided
|
||||
if min_sec is None:
|
||||
min_sec = self.timing['action_delay_min']
|
||||
if max_sec is None:
|
||||
max_sec = self.timing['action_delay_max']
|
||||
|
||||
delay_time = random.uniform(min_sec, max_sec)
|
||||
self._emit_progress("delay_started", {"duration": delay_time})
|
||||
time.sleep(delay_time)
|
||||
@@ -129,7 +160,8 @@ class Scraper:
|
||||
if self._stop_requested:
|
||||
return
|
||||
element.send_keys(char)
|
||||
time.sleep(random.uniform(0.05, 0.15))
|
||||
typing_delay = self.timing['typing_delay']
|
||||
time.sleep(random.uniform(typing_delay * 0.5, typing_delay * 1.5))
|
||||
|
||||
def navigate(self, url):
|
||||
"""
|
||||
@@ -330,8 +362,9 @@ class Scraper:
|
||||
|
||||
# Take a break between pages (more likely and longer)
|
||||
if page_num > start_page:
|
||||
if random.random() < 0.7: # 70% chance of break
|
||||
break_time = random.uniform(15, 45) # 15-45 seconds
|
||||
break_chance = self.timing['page_break_chance'] / 100.0
|
||||
if random.random() < break_chance:
|
||||
break_time = random.uniform(self.timing['page_break_min'], self.timing['page_break_max'])
|
||||
self._emit_progress("page_break_started", {
|
||||
"duration": break_time,
|
||||
"page_number": page_num
|
||||
@@ -446,9 +479,10 @@ class Scraper:
|
||||
"comic_index": i
|
||||
})
|
||||
|
||||
# Take a longer break every 5 comics
|
||||
if i % 5 == 0 and i < len(comic_urls):
|
||||
break_time = random.uniform(3, 7)
|
||||
# Take a longer break every N comics (configurable)
|
||||
batch_interval = self.timing['batch_break_interval']
|
||||
if i % batch_interval == 0 and i < len(comic_urls):
|
||||
break_time = random.uniform(self.timing['batch_break_min'], self.timing['batch_break_max'])
|
||||
self._emit_progress("comic_batch_break", {
|
||||
"duration": break_time,
|
||||
"comics_processed": i
|
||||
|
||||
Reference in New Issue
Block a user