- Created `install_and_run.bat` for Windows installation and setup.
- Created `install_and_run.sh` for Unix-based systems installation and setup.
- Removed `main.py` as it is no longer needed.
- Updated `requirements.txt` to specify package versions and added PyQt5.
- Deleted `start.bat` as it is redundant.
- Added unit tests for core functionality and scraping modes.
- Implemented input validation utilities in `utils/validators.py`.
- Added support for dual scraping modes in the scraper.
477 lines
19 KiB
Python
477 lines
19 KiB
Python
"""
Progress dialog for real-time scraping progress monitoring.
"""
|
|
|
|
import time
|
|
from PyQt5.QtWidgets import (
|
|
QDialog, QVBoxLayout, QHBoxLayout, QGridLayout,
|
|
QPushButton, QLabel, QProgressBar, QTextEdit, QGroupBox
|
|
)
|
|
from PyQt5.QtCore import Qt, QTimer, pyqtSignal
|
|
from PyQt5.QtGui import QFont
|
|
|
|
from pathlib import Path
|
|
import sys
|
|
|
|
# Prepend the project root (the parent of the directory holding this file)
# to the module search path.
project_root = Path(__file__).parent.parent
sys.path[:0] = [str(project_root)]
|
|
|
|
|
|
class ProgressDialog(QDialog):
    """
    Dialog for displaying real-time scraping progress.

    Shows progress bars for pages and comics, current activity status,
    and a detailed log of operations with the ability to cancel.

    The dialog subscribes to the signals of the ``scraper_thread`` handed to
    the constructor (see ``connect_signals`` for the signal names it expects)
    and refuses to close while that thread reports ``isRunning()``.
    """

    # Signals
    # Emitted every time the user asks to cancel (see ``cancel_scraping``).
    cancel_requested = pyqtSignal()

    # Stylesheets for the activity label, by outcome.
    _STYLE_OK = "font-weight: bold; color: #2E8B57;"
    _STYLE_WARNING = "font-weight: bold; color: #FF6B35;"
    _STYLE_ERROR = "font-weight: bold; color: #f44336;"

    # Placeholder shown in the comic progress label until a page reports
    # how many comics it contains.
    _WAITING_FOR_PAGE = "Waiting for page data..."

    # Number of most recent per-comic durations kept for the rolling average.
    _MAX_DURATION_SAMPLES = 20

    def __init__(self, parent=None, scraper_thread=None):
        """
        Build the dialog and, when given, subscribe to the scraper thread.

        Args:
            parent: Optional parent widget, forwarded to ``QDialog``.
            scraper_thread: Optional worker object. It must expose the
                signals connected in ``connect_signals`` as well as
                ``request_stop()`` and ``isRunning()``.
        """
        super().__init__(parent)
        self.scraper_thread = scraper_thread
        self.start_time = time.time()

        # Progress tracking
        self.total_pages = 0
        self.current_page = 0
        self.total_comics_on_page = 0
        self.current_comic = 0
        self.total_comics_processed = 0
        self.total_downloads_triggered = 0

        # Enhanced time tracking for better estimation
        self.comic_start_times = []  # Kept for compatibility; not used by this class
        self.comic_durations = []  # Seconds taken by the most recent comics
        self.estimated_total_comics = 0  # Estimated total comics across all pages
        self.last_comic_start = None  # time.time() of the comic in flight, if any
        self.pages_processed = 0

        self.init_ui()
        self.connect_signals()

    # ------------------------------------------------------------------
    # UI construction
    # ------------------------------------------------------------------
    def init_ui(self):
        """Initialize the user interface (window geometry plus all sections)."""
        self.setWindowTitle("Scraping Progress")
        self.setMinimumSize(500, 400)
        self.resize(600, 500)

        layout = QVBoxLayout(self)

        # Sections are stacked top to bottom in this order.
        self.create_overall_progress_section(layout)
        self.create_activity_section(layout)
        self.create_details_section(layout)
        self.create_log_section(layout)
        self.create_control_section(layout)

    def create_overall_progress_section(self, parent_layout):
        """Create the page and per-page comic progress bars with their labels."""
        group = QGroupBox("Overall Progress")
        layout = QVBoxLayout(group)

        # Page progress
        self.page_progress_label = QLabel("Initializing...")
        layout.addWidget(self.page_progress_label)

        self.page_progress_bar = QProgressBar()
        self.page_progress_bar.setRange(0, 100)
        layout.addWidget(self.page_progress_bar)

        # Comic progress (current page)
        self.comic_progress_label = QLabel(self._WAITING_FOR_PAGE)
        layout.addWidget(self.comic_progress_label)

        self.comic_progress_bar = QProgressBar()
        self.comic_progress_bar.setRange(0, 100)
        layout.addWidget(self.comic_progress_bar)

        parent_layout.addWidget(group)

    def create_activity_section(self, parent_layout):
        """Create the current activity section (headline plus item detail)."""
        group = QGroupBox("Current Activity")
        layout = QVBoxLayout(group)

        self.activity_label = QLabel("Starting scraper...")
        self.activity_label.setStyleSheet(self._STYLE_OK)
        layout.addWidget(self.activity_label)

        # Current item details
        self.current_item_label = QLabel("")
        layout.addWidget(self.current_item_label)

        parent_layout.addWidget(group)

    def create_details_section(self, parent_layout):
        """
        Create the session details grid and start the once-per-second timer.

        The timer drives ``update_elapsed_time`` and is stored on
        ``self.timer`` so it can be stopped on completion or close.
        """
        group = QGroupBox("Session Details")
        layout = QGridLayout(group)

        # Time information
        layout.addWidget(QLabel("Time Elapsed:"), 0, 0)
        self.elapsed_time_label = QLabel("00:00:00")
        layout.addWidget(self.elapsed_time_label, 0, 1)

        layout.addWidget(QLabel("Estimated Remaining:"), 0, 2)
        self.remaining_time_label = QLabel("Calculating...")
        layout.addWidget(self.remaining_time_label, 0, 3)

        # Progress statistics
        layout.addWidget(QLabel("Comics Processed:"), 1, 0)
        self.comics_processed_label = QLabel("0")
        layout.addWidget(self.comics_processed_label, 1, 1)

        layout.addWidget(QLabel("Downloads Triggered:"), 1, 2)
        self.downloads_triggered_label = QLabel("0")
        layout.addWidget(self.downloads_triggered_label, 1, 3)

        parent_layout.addWidget(group)

        # Start timer for elapsed time updates. Parenting it to the dialog
        # ties its lifetime to the dialog's.
        self.timer = QTimer(self)
        self.timer.timeout.connect(self.update_elapsed_time)
        self.timer.start(1000)  # Update every second

    def create_log_section(self, parent_layout):
        """Create the read-only log display section."""
        group = QGroupBox("Activity Log")
        layout = QVBoxLayout(group)

        self.log_text = QTextEdit()
        self.log_text.setReadOnly(True)
        self.log_text.setMaximumHeight(150)

        # Set monospace font for logs (cross-platform)
        font = QFont()
        font.setFamily("Monaco, Consolas, 'Courier New', monospace")  # Cross-platform fallback
        font.setPointSize(9)
        font.setStyleHint(QFont.TypeWriter)  # Monospace hint
        self.log_text.setFont(font)

        layout.addWidget(self.log_text)
        parent_layout.addWidget(group)

    def create_control_section(self, parent_layout):
        """Create the right-aligned Cancel / Close button row."""
        layout = QHBoxLayout()
        layout.addStretch()

        self.cancel_btn = QPushButton("Cancel Operation")
        self.cancel_btn.clicked.connect(self.cancel_scraping)
        self.cancel_btn.setStyleSheet("QPushButton { background-color: #f44336; color: white; font-weight: bold; padding: 8px; }")
        layout.addWidget(self.cancel_btn)

        self.close_btn = QPushButton("Close")
        self.close_btn.clicked.connect(self.accept)
        self.close_btn.setEnabled(False)  # Enabled when scraping completes
        layout.addWidget(self.close_btn)

        parent_layout.addLayout(layout)

    def connect_signals(self):
        """Connect signals from the scraper thread; no-op without a thread."""
        thread = self.scraper_thread
        if not thread:
            return

        # Login signals
        thread.login_started.connect(self.on_login_started)
        thread.login_success.connect(self.on_login_success)
        thread.login_failed.connect(self.on_login_failed)

        # Scraping progress
        thread.scraping_started.connect(self.on_scraping_started)
        thread.scraping_completed.connect(self.on_scraping_completed)

        # Page progress
        thread.page_started.connect(self.on_page_started)
        thread.page_completed.connect(self.on_page_completed)
        thread.page_comics_found.connect(self.on_page_comics_found)

        # Comic progress
        thread.comic_started.connect(self.on_comic_started)
        thread.comic_completed.connect(self.on_comic_completed)
        thread.comic_title_extracted.connect(self.on_comic_title_extracted)

        # Download progress
        thread.download_links_found.connect(self.on_download_links_found)
        thread.download_started.connect(self.on_download_started)
        thread.download_triggered.connect(self.on_download_triggered)

        # General status
        thread.status_update.connect(self.log_message)
        thread.error_occurred.connect(self.on_error_occurred)

        # Timing events (both kinds of break share one handler)
        thread.page_break_started.connect(self.on_break_started)
        thread.comic_batch_break.connect(self.on_break_started)

    # ------------------------------------------------------------------
    # User actions and logging
    # ------------------------------------------------------------------
    def cancel_scraping(self):
        """
        Cancel the scraping operation.

        Asks the scraper thread (if any) to stop, disables the cancel button
        so the request cannot be repeated, and emits ``cancel_requested`` so
        that owners of the dialog can react as well.
        """
        if self.scraper_thread:
            self.log_message("Cancel requested - stopping after current operation...")
            self.scraper_thread.request_stop()
            self.cancel_btn.setEnabled(False)
            self.activity_label.setText("Cancelling...")
            self.activity_label.setStyleSheet(self._STYLE_WARNING)

        # The signal was declared but never emitted before; emit it so
        # connected slots actually fire.
        self.cancel_requested.emit()

    def log_message(self, message):
        """Append ``message`` to the log, prefixed with the local HH:MM:SS time."""
        timestamp = time.strftime("%H:%M:%S")
        self.log_text.append(f"[{timestamp}] {message}")

        # Auto-scroll to bottom
        scrollbar = self.log_text.verticalScrollBar()
        scrollbar.setValue(scrollbar.maximum())

    # ------------------------------------------------------------------
    # Time display and estimation
    # ------------------------------------------------------------------
    def update_elapsed_time(self):
        """Refresh the elapsed-time label and the remaining-time estimate."""
        elapsed = int(time.time() - self.start_time)
        hours, remainder = divmod(elapsed, 3600)
        minutes, seconds = divmod(remainder, 60)

        self.elapsed_time_label.setText(f"{hours:02d}:{minutes:02d}:{seconds:02d}")

        # Enhanced time estimation based on comic processing rate
        self.calculate_realistic_time_estimate(elapsed)

    def calculate_realistic_time_estimate(self, elapsed):
        """
        Update the remaining-time label from the best data available.

        Strategies, in order of preference:

        1. Rolling average of recorded comic durations (needs at least two
           samples and a known ``estimated_total_comics``).
        2. Overall rate ``elapsed / total_comics_processed``.
        3. Combined page + current-page comic progress, once more than 5%
           of the work is done.

        Otherwise the label shows "Calculating...".

        Args:
            elapsed: Seconds since the dialog was created.
        """
        try:
            comics_known = self.estimated_total_comics > 0

            if comics_known and len(self.comic_durations) >= 2:
                # Average time per comic from actual recent measurements.
                average = sum(self.comic_durations) / len(self.comic_durations)
                self._show_comic_based_estimate(average)

            elif comics_known and self.total_comics_processed > 0:
                # Not enough samples yet: use the overall processing rate.
                self._show_comic_based_estimate(elapsed / self.total_comics_processed)

            elif self.total_pages > 0 and self.total_comics_on_page > 0:
                # Pages completed + fraction of the current page done.
                # Clamped so a page index of 0 cannot yield negative progress.
                pages_completed = max(self.current_page - 1, 0)
                current_page_progress = self.current_comic / self.total_comics_on_page
                total_progress = (pages_completed + current_page_progress) / self.total_pages

                if total_progress > 0.05:  # Only estimate after 5% progress
                    estimated_total = elapsed / total_progress
                    # format_remaining_time shows "Almost done!" for <= 0.
                    self.format_remaining_time(int(estimated_total - elapsed))
                else:
                    self.remaining_time_label.setText("Calculating...")
            else:
                self.remaining_time_label.setText("Calculating...")

        except (ZeroDivisionError, ValueError):
            self.remaining_time_label.setText("Calculating...")

    def _show_comic_based_estimate(self, seconds_per_comic):
        """Display the estimate for the comics still outstanding."""
        comics_remaining = self.estimated_total_comics - self.total_comics_processed
        if comics_remaining > 0:
            self.format_remaining_time(int(comics_remaining * seconds_per_comic))
        else:
            self.remaining_time_label.setText("Almost done!")

    def format_remaining_time(self, remaining_seconds):
        """
        Write ``remaining_seconds`` to the remaining-time label.

        Formats as ``HH:MM:SS`` when at least an hour remains, ``MM:SS``
        when at least a minute remains, ``"N sec"`` otherwise, and
        "Almost done!" for zero or negative values.
        """
        # Coerce so a float argument cannot break the ``:02d`` formatting.
        remaining_seconds = int(remaining_seconds)
        if remaining_seconds <= 0:
            self.remaining_time_label.setText("Almost done!")
            return

        rem_hours, remainder = divmod(remaining_seconds, 3600)
        rem_minutes, rem_secs = divmod(remainder, 60)

        # Show different formats based on duration
        if rem_hours > 0:
            text = f"{rem_hours:02d}:{rem_minutes:02d}:{rem_secs:02d}"
        elif rem_minutes > 0:
            text = f"{rem_minutes:02d}:{rem_secs:02d}"
        else:
            text = f"{rem_secs} sec"
        self.remaining_time_label.setText(text)

    # ------------------------------------------------------------------
    # Display refresh helpers
    # ------------------------------------------------------------------
    def update_progress_bars(self):
        """
        Update progress bars based on current state.

        Each bar is only touched when its total is known (> 0); otherwise
        its previous value and label are left as they are.
        """
        # Page progress
        if self.total_pages > 0:
            page_progress = int((self.current_page / self.total_pages) * 100)
            self.page_progress_bar.setValue(page_progress)
            self.page_progress_label.setText(f"Page {self.current_page} of {self.total_pages} ({page_progress}%)")

        # Comic progress
        if self.total_comics_on_page > 0:
            comic_progress = int((self.current_comic / self.total_comics_on_page) * 100)
            self.comic_progress_bar.setValue(comic_progress)
            self.comic_progress_label.setText(f"Comic {self.current_comic} of {self.total_comics_on_page} ({comic_progress}%)")

    def update_statistics(self):
        """Update the processed-comics and triggered-downloads counters."""
        self.comics_processed_label.setText(str(self.total_comics_processed))
        self.downloads_triggered_label.setText(str(self.total_downloads_triggered))

    # ------------------------------------------------------------------
    # Event handlers (slots for the scraper thread's signals)
    # ------------------------------------------------------------------
    def on_login_started(self, username):
        """Handle login started."""
        self.activity_label.setText(f"Logging in as {username}...")
        self.log_message(f"Logging in as {username}")

    def on_login_success(self, username):
        """Handle successful login."""
        self.activity_label.setText("Login successful - starting scraper...")
        self.log_message(f"Login successful for {username}")

    def on_login_failed(self, username, error):
        """Handle failed login: flag the activity label and log ``error``."""
        self.activity_label.setText("Login failed")
        self.activity_label.setStyleSheet(self._STYLE_ERROR)
        self.log_message(f"Login failed: {error}")

    def on_scraping_started(self, start_page, end_page, total_pages):
        """Handle scraping start: record the page total and reset the page index."""
        self.total_pages = total_pages
        self.current_page = 0
        self.activity_label.setText(f"Starting scraping: pages {start_page} to {end_page}")
        self.log_message(f"Starting scraping: pages {start_page} to {end_page}")
        self.update_progress_bars()

    def on_scraping_completed(self, summary):
        """
        Handle scraping completion.

        Args:
            summary: Mapping read with ``.get`` for the keys ``cancelled``,
                ``success``, ``total_comics_processed`` and
                ``total_downloads_triggered``. A missing or empty summary is
                treated as "completed with errors".
        """
        summary = summary or {}

        self.cancel_btn.setEnabled(False)
        self.close_btn.setEnabled(True)
        self.timer.stop()

        if summary.get('cancelled'):
            self.activity_label.setText("Scraping cancelled by user")
            self.activity_label.setStyleSheet(self._STYLE_WARNING)
        elif summary.get('success'):
            self.activity_label.setText("Scraping completed successfully!")
            self.activity_label.setStyleSheet(self._STYLE_OK)
        else:
            self.activity_label.setText("Scraping completed with errors")
            self.activity_label.setStyleSheet(self._STYLE_ERROR)

        # Update final statistics. Fall back to the live counters so a
        # summary without these keys does not wipe the display back to 0.
        self.total_comics_processed = summary.get(
            'total_comics_processed', self.total_comics_processed)
        self.total_downloads_triggered = summary.get(
            'total_downloads_triggered', self.total_downloads_triggered)
        self.update_statistics()

        self.log_message("Scraping operation completed")

    def on_page_started(self, page_number, page_index, total_pages, url):
        """
        Handle page start.

        ``page_index`` becomes ``current_page`` and is shown against
        ``self.total_pages`` (presumably a 1-based position within the run -
        TODO confirm against the emitter). The ``total_pages`` argument is
        not used here.
        """
        self.current_page = page_index
        self.current_comic = 0
        self.total_comics_on_page = 0

        # Clear the previous page's comic progress; update_progress_bars
        # skips the comic bar while the comic count is still unknown.
        self.comic_progress_bar.setValue(0)
        self.comic_progress_label.setText(self._WAITING_FOR_PAGE)

        self.activity_label.setText(f"Processing page {page_number}...")
        self.current_item_label.setText(f"URL: {url}")
        self.update_progress_bars()
        self.log_message(f"Started processing page {page_number}")

    def on_page_completed(self, page_number, comics_processed):
        """Handle page completion."""
        self.pages_processed = self.current_page
        self.log_message(f"Completed page {page_number} - {comics_processed} comics processed")

    def on_page_comics_found(self, page_number, comic_count):
        """
        Handle comics found on page.

        Stores the per-page total and extrapolates ``estimated_total_comics``
        from the average number of comics per page seen so far.
        """
        self.total_comics_on_page = comic_count
        self.current_comic = 0

        # Update estimated total comics based on current data
        if self.total_pages > 0 and self.current_page > 0:
            avg_comics_per_page = (self.total_comics_processed + comic_count) / self.current_page
            self.estimated_total_comics = int(avg_comics_per_page * self.total_pages)

        self.log_message(f"Found {comic_count} comics on page {page_number}")
        self.update_progress_bars()

    def on_comic_started(self, page_number, comic_index, total_comics, url):
        """Handle comic start and remember when it began."""
        self.current_comic = comic_index
        self.last_comic_start = time.time()  # Track start time for duration calculation
        self.activity_label.setText(f"Processing comic {comic_index} of {total_comics}...")
        self.current_item_label.setText(f"URL: {url}")
        self.update_progress_bars()

    def on_comic_completed(self, title, downloads_triggered, page_number, comic_index):
        """
        Handle comic completion.

        Records how long the comic took (when its start was seen), bumps the
        processed counter and logs the result.
        """
        # Track timing data for enhanced estimation
        if self.last_comic_start is not None:
            self.comic_durations.append(time.time() - self.last_comic_start)
            # Consume the start time so a repeated completion signal cannot
            # record a second, inflated duration for the same comic.
            self.last_comic_start = None
            # Keep only recent durations for adaptive estimation
            if len(self.comic_durations) > self._MAX_DURATION_SAMPLES:
                del self.comic_durations[:-self._MAX_DURATION_SAMPLES]

        # Update live counters
        self.total_comics_processed += 1
        # Note: downloads_triggered counter is updated in real-time in on_download_triggered
        self.update_statistics()  # This updates the live display
        self.log_message(f"Completed: {title} ({downloads_triggered} downloads)")

    def on_comic_title_extracted(self, title, url):
        """Handle comic title extraction."""
        self.current_item_label.setText(f"Processing: {title}")

    def on_download_links_found(self, title, download_count):
        """Handle download links found."""
        self.log_message(f"Found {download_count} download links for: {title}")

    def on_download_started(self, file_name, url, index, total):
        """Handle download start."""
        self.activity_label.setText(f"Downloading file {index} of {total}")
        self.current_item_label.setText(f"File: {file_name}")

    def on_download_triggered(self, url):
        """Handle download triggered by bumping the live download counter."""
        # Update download counter in real-time
        self.total_downloads_triggered += 1
        self.update_statistics()

    def on_error_occurred(self, error_message):
        """Handle error by writing it to the log."""
        self.log_message(f"ERROR: {error_message}")

    def on_break_started(self, duration, context=None):
        """Handle break start; ``duration`` is shown in seconds, ``context`` is unused."""
        self.activity_label.setText(f"Taking a break for {duration:.1f} seconds...")
        self.current_item_label.setText("Human-like delay in progress...")

    # ------------------------------------------------------------------
    # Close handling
    # ------------------------------------------------------------------
    def _scraper_is_running(self):
        """Return True when a scraper thread is attached and still running."""
        return bool(self.scraper_thread and self.scraper_thread.isRunning())

    def keyPressEvent(self, event):
        """
        Swallow Escape while scraping is active.

        Escape makes ``QDialog`` call ``reject()``, which closes the dialog
        without passing through ``closeEvent``; without this guard the
        "no closing while scraping" rule could be bypassed from the keyboard.
        """
        if event.key() == Qt.Key_Escape and self._scraper_is_running():
            event.accept()
            return
        super().keyPressEvent(event)

    def closeEvent(self, event):
        """Handle dialog close: refuse while scraping, otherwise stop the timer."""
        if self._scraper_is_running():
            # Don't allow closing while scraping is active
            event.ignore()
            return

        # Stop timer
        if hasattr(self, 'timer'):
            self.timer.stop()
        event.accept()