feat: Enhance scraper timing configuration with user-defined settings and UI adjustments
This commit is contained in:
@@ -257,7 +257,17 @@ class CredentialManager:
|
||||
'download_path': str(Path.home() / "Downloads"),
|
||||
'default_start_page': 1,
|
||||
'default_end_page': 1,
|
||||
'scraping_mode': 0 # 0=All Comics, 1=Latest Comics
|
||||
'scraping_mode': 0, # 0=All Comics, 1=Latest Comics
|
||||
# Timing configuration defaults
|
||||
'action_delay_min': 0.5, # Minimum delay between actions (seconds)
|
||||
'action_delay_max': 2.0, # Maximum delay between actions (seconds)
|
||||
'page_break_chance': 70, # Percentage chance of taking a break between pages
|
||||
'page_break_min': 15, # Minimum page break duration (seconds)
|
||||
'page_break_max': 45, # Maximum page break duration (seconds)
|
||||
'batch_break_interval': 5, # Take a break every N comics
|
||||
'batch_break_min': 3, # Minimum batch break duration (seconds)
|
||||
'batch_break_max': 7, # Maximum batch break duration (seconds)
|
||||
'typing_delay': 0.1, # Delay between character typing (seconds)
|
||||
}
|
||||
|
||||
def export_settings(self, export_path):
|
||||
|
||||
@@ -25,7 +25,7 @@ class Scraper:
|
||||
callback mechanisms for progress updates to a GUI application.
|
||||
"""
|
||||
|
||||
def __init__(self, headless=False, progress_callback=None, scraping_mode=0):
|
||||
def __init__(self, headless=False, progress_callback=None, scraping_mode=0, timing_config=None):
|
||||
"""
|
||||
Initialize the scraper with optional GUI callback support.
|
||||
|
||||
@@ -34,11 +34,16 @@ class Scraper:
|
||||
progress_callback (callable): Optional callback function for progress updates
|
||||
Callback signature: callback(event_type: str, data: dict)
|
||||
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
|
||||
timing_config (dict): Timing configuration settings
|
||||
"""
|
||||
self.progress_callback = progress_callback
|
||||
self._stop_requested = False
|
||||
self.scraping_mode = scraping_mode
|
||||
|
||||
# Set up timing configuration with defaults
|
||||
self.timing = timing_config or {}
|
||||
self._setup_timing_defaults()
|
||||
|
||||
# Set up Chrome options with anti-detection measures
|
||||
chrome_options = Options()
|
||||
if headless:
|
||||
@@ -103,16 +108,42 @@ class Scraper:
|
||||
self._stop_requested = True
|
||||
self._emit_progress("stop_requested", {})
|
||||
|
||||
def human_delay(self, min_sec=0.5, max_sec=2):
|
||||
def _setup_timing_defaults(self):
    """
    Populate ``self.timing`` with a value for every timing option.

    Values already present in ``self.timing`` take precedence over the
    built-in defaults. A key whose value is ``None`` is treated as
    "not set" and receives the default, because the timing values are
    later used in arithmetic and passed to ``random.uniform``.

    ``self.timing`` is rebound to a new dict rather than updated in
    place, so a configuration dict handed in by the caller is never
    modified.
    """
    defaults = {
        'action_delay_min': 0.5,     # seconds
        'action_delay_max': 2.0,     # seconds
        'page_break_chance': 70,     # percent
        'page_break_min': 15,        # seconds
        'page_break_max': 45,        # seconds
        'batch_break_interval': 5,   # take a break every N comics
        'batch_break_min': 3,        # seconds
        'batch_break_max': 7,        # seconds
        'typing_delay': 0.1,         # seconds
    }

    # Tolerate a missing/empty configuration the same way __init__ does.
    supplied = self.timing or {}

    # Drop explicit None entries so they fall back to the defaults.
    overrides = {
        key: value for key, value in supplied.items() if value is not None
    }

    # Build a fresh dict: defaults first, caller-provided values on top.
    self.timing = {**defaults, **overrides}
|
||||
|
||||
def human_delay(self, min_sec=None, max_sec=None):
|
||||
"""
|
||||
Simulate human-like delay with cancellation support.
|
||||
|
||||
Args:
|
||||
min_sec (float): Minimum delay time
|
||||
max_sec (float): Maximum delay time
|
||||
min_sec (float): Minimum delay time (uses config default if None)
|
||||
max_sec (float): Maximum delay time (uses config default if None)
|
||||
"""
|
||||
if self._stop_requested:
|
||||
return
|
||||
|
||||
# Use configured timing if no specific values provided
|
||||
if min_sec is None:
|
||||
min_sec = self.timing['action_delay_min']
|
||||
if max_sec is None:
|
||||
max_sec = self.timing['action_delay_max']
|
||||
|
||||
delay_time = random.uniform(min_sec, max_sec)
|
||||
self._emit_progress("delay_started", {"duration": delay_time})
|
||||
time.sleep(delay_time)
|
||||
@@ -129,7 +160,8 @@ class Scraper:
|
||||
if self._stop_requested:
|
||||
return
|
||||
element.send_keys(char)
|
||||
time.sleep(random.uniform(0.05, 0.15))
|
||||
typing_delay = self.timing['typing_delay']
|
||||
time.sleep(random.uniform(typing_delay * 0.5, typing_delay * 1.5))
|
||||
|
||||
def navigate(self, url):
|
||||
"""
|
||||
@@ -330,8 +362,9 @@ class Scraper:
|
||||
|
||||
# Take a break between pages (more likely and longer)
|
||||
if page_num > start_page:
|
||||
if random.random() < 0.7: # 70% chance of break
|
||||
break_time = random.uniform(15, 45) # 15-45 seconds
|
||||
break_chance = self.timing['page_break_chance'] / 100.0
|
||||
if random.random() < break_chance:
|
||||
break_time = random.uniform(self.timing['page_break_min'], self.timing['page_break_max'])
|
||||
self._emit_progress("page_break_started", {
|
||||
"duration": break_time,
|
||||
"page_number": page_num
|
||||
@@ -446,9 +479,10 @@ class Scraper:
|
||||
"comic_index": i
|
||||
})
|
||||
|
||||
# Take a longer break every 5 comics
|
||||
if i % 5 == 0 and i < len(comic_urls):
|
||||
break_time = random.uniform(3, 7)
|
||||
# Take a longer break every N comics (configurable)
|
||||
batch_interval = self.timing['batch_break_interval']
|
||||
if i % batch_interval == 0 and i < len(comic_urls):
|
||||
break_time = random.uniform(self.timing['batch_break_min'], self.timing['batch_break_max'])
|
||||
self._emit_progress("comic_batch_break", {
|
||||
"duration": break_time,
|
||||
"comics_processed": i
|
||||
|
||||
@@ -58,7 +58,7 @@ class ScraperThread(QThread):
|
||||
comic_batch_break = pyqtSignal(float, int) # duration, comics_processed
|
||||
download_delay = pyqtSignal(float, int) # duration, remaining_downloads
|
||||
|
||||
def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True):
|
||||
def __init__(self, username, password, start_page, end_page, scraping_mode=0, headless=True, timing_config=None):
|
||||
"""
|
||||
Initialize the scraper thread.
|
||||
|
||||
@@ -69,6 +69,7 @@ class ScraperThread(QThread):
|
||||
end_page (int): Ending page number
|
||||
scraping_mode (int): Scraping mode (0=All Comics, 1=Latest Comics)
|
||||
headless (bool): Whether to run Chrome in headless mode
|
||||
timing_config (dict): Timing configuration settings
|
||||
"""
|
||||
super().__init__()
|
||||
self.username = username
|
||||
@@ -77,6 +78,7 @@ class ScraperThread(QThread):
|
||||
self.end_page = end_page
|
||||
self.scraping_mode = scraping_mode
|
||||
self.headless = headless
|
||||
self.timing_config = timing_config
|
||||
self.scraper = None
|
||||
self._is_running = False
|
||||
|
||||
@@ -92,7 +94,8 @@ class ScraperThread(QThread):
|
||||
self.scraper = Scraper(
|
||||
headless=self.headless,
|
||||
progress_callback=self._handle_scraper_progress,
|
||||
scraping_mode=self.scraping_mode
|
||||
scraping_mode=self.scraping_mode,
|
||||
timing_config=self.timing_config
|
||||
)
|
||||
|
||||
# Perform login
|
||||
|
||||
Reference in New Issue
Block a user