- Created `install_and_run.bat` for Windows installation and setup. - Created `install_and_run.sh` for Unix-based systems installation and setup. - Removed `main.py` as it is no longer needed. - Updated `requirements.txt` to specify package versions and added PyQt5. - Deleted `start.bat` as it is redundant. - Added unit tests for core functionality and scraping modes. - Implemented input validation utilities in `utils/validators.py`. - Added support for dual scraping modes in the scraper.
319 lines
8.6 KiB
Python
319 lines
8.6 KiB
Python
"""
|
|
Input validation utilities for the EBoek.info scraper GUI.
|
|
"""
|
|
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
def validate_page_range(start_page, end_page, max_pages=100):
    """
    Validate page range input.

    The range is treated as inclusive on both ends (a range where start and
    end are equal is accepted as a single page), so it spans
    ``end_page - start_page + 1`` pages.

    Args:
        start_page (int or str): Starting page number
        end_page (int or str): Ending page number
        max_pages (int): Maximum number of pages allowed in one range
            (defaults to 100)

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list, and cleaned
            values ('start_page' / 'end_page' are ints, or None when the
            corresponding input could not be converted)
    """
    errors = []
    cleaned_start = None
    cleaned_end = None

    # Convert each bound independently so both conversion errors are reported
    try:
        cleaned_start = int(start_page)
    except (ValueError, TypeError):
        errors.append("Start page must be a valid number")

    try:
        cleaned_end = int(end_page)
    except (ValueError, TypeError):
        errors.append("End page must be a valid number")

    # Range checks only make sense when both bounds are real numbers
    if cleaned_start is not None and cleaned_end is not None:
        if cleaned_start < 1:
            errors.append("Start page must be 1 or greater")
        if cleaned_end < 1:
            errors.append("End page must be 1 or greater")
        if cleaned_start > cleaned_end:
            errors.append("Start page cannot be greater than end page")
        # Inclusive page count: pages 1..100 are exactly 100 pages
        if cleaned_end - cleaned_start + 1 > max_pages:
            errors.append(f"Page range too large (maximum {max_pages} pages at once)")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'start_page': cleaned_start,
        'end_page': cleaned_end,
    }
|
|
|
|
|
|
def validate_username(username):
    """
    Check an EBoek.info username for presence, length and allowed characters.

    Args:
        username (str): Username to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list and the
            whitespace-stripped 'username' (empty string when none was given)
    """
    # Nothing supplied at all: report it and stop early
    if not username:
        return {
            'valid': False,
            'errors': ["Username is required"],
            'username': "",
        }

    candidate = username.strip()
    problems = []

    # Only the first failing rule is reported
    if len(candidate) < 2:
        problems.append("Username must be at least 2 characters long")
    elif len(candidate) > 50:
        problems.append("Username is too long (maximum 50 characters)")
    elif re.match(r'^[a-zA-Z0-9_.-]+$', candidate) is None:
        problems.append("Username contains invalid characters (use only letters, numbers, _, ., -)")

    return {
        'valid': not problems,
        'errors': problems,
        'username': candidate,
    }
|
|
|
|
|
|
def validate_password(password):
    """
    Check an EBoek.info password for presence and length.

    Args:
        password (str): Password to validate

    Returns:
        dict: Validation result with 'valid' bool and 'errors' list
    """
    problems = []

    if password:
        size = len(password)
        # Accepted lengths are 3 through 128 characters
        if size < 3:
            problems.append("Password must be at least 3 characters long")
        elif size > 128:
            problems.append("Password is too long (maximum 128 characters)")
    else:
        problems.append("Password is required")

    return {
        'valid': not problems,
        'errors': problems,
    }
|
|
|
|
|
|
def validate_url(url):
    """
    Validate URL format.

    The value is stripped of surrounding whitespace before it is checked, so
    the URL that is validated is exactly the URL returned under 'url'.

    Args:
        url (str): URL to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list and the
            stripped 'url' (empty string when the input was missing or was
            not a string)
    """
    errors = []
    cleaned = ""

    if not url:
        errors.append("URL is required")
    elif not isinstance(url, str):
        # Non-text input cannot be parsed or stripped; report it instead of raising
        errors.append("Invalid URL format")
    else:
        cleaned = url.strip()
        if not cleaned:
            # Whitespace-only input is the same as no input
            errors.append("URL is required")
        else:
            try:
                parsed = urlparse(cleaned)
            except ValueError:
                # urlparse raises ValueError for malformed input such as an
                # unterminated IPv6 literal
                errors.append("Invalid URL format")
            else:
                if not parsed.scheme:
                    errors.append("URL must include protocol (http:// or https://)")
                elif parsed.scheme not in ('http', 'https'):
                    errors.append("URL must use http:// or https://")
                if not parsed.netloc:
                    errors.append("URL must include domain name")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'url': cleaned,
    }
|
|
|
|
|
|
def validate_file_path(file_path):
    """
    Validate file path format.

    Surrounding whitespace is stripped before any check, so a path made up
    only of whitespace is reported as missing rather than accepted as an
    empty path.

    Args:
        file_path (str): File path to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list and the
            stripped 'path' (empty string when no path was given)
    """
    errors = []
    cleaned = file_path.strip() if file_path else ""

    if not cleaned:
        errors.append("File path is required")
    else:
        # Basic path validation - more specific validation would depend on OS
        invalid_chars = ('<', '>', '|', '"', '*', '?')
        # Report only the first offending character, in the order listed above
        offender = next((char for char in invalid_chars if char in cleaned), None)
        if offender is not None:
            errors.append(f"File path contains invalid character: {offender}")

        if len(cleaned) > 255:
            errors.append("File path is too long (maximum 255 characters)")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'path': cleaned,
    }
|
|
|
|
|
|
def sanitize_filename(filename):
    """
    Sanitize a filename for safe storage.

    Characters that are not allowed in file names on common file systems are
    replaced with underscores, leading/trailing spaces and dots are removed,
    and the result is limited to 200 characters.

    Args:
        filename (str): Original filename

    Returns:
        str: Sanitized filename safe for most file systems; "download" when
            the input is empty or nothing usable remains after cleaning
    """
    if not filename:
        return "download"

    # Replace invalid characters with underscores in a single pass
    replacements = str.maketrans({char: '_' for char in '<>:"/\\|?*'})
    sanitized = filename.translate(replacements)

    # Remove leading/trailing spaces and dots
    sanitized = sanitized.strip(' .')

    if not sanitized:
        return "download"

    if len(sanitized) > 200:
        # Cutting the name can expose a new trailing space or dot, so strip the
        # end again. The result cannot become empty: the first character is
        # already known not to be a space or dot.
        sanitized = sanitized[:200].rstrip(' .')

    return sanitized
|
|
|
|
|
|
def validate_settings(settings):
    """
    Check an application settings dictionary key by key.

    Only keys that are present are examined. Values that pass are copied into
    the cleaned result; every value that fails contributes an error message.

    Args:
        settings (dict): Settings to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list, and the
            cleaned 'settings' dict holding only the accepted entries
    """
    problems = []
    accepted = {}

    # Boolean switches, each paired with the message used when it is not a bool
    boolean_options = (
        ('headless_mode', "Headless mode must be true or false"),
        ('verbose_logging', "Verbose logging must be true or false"),
        ('auto_save_credentials', "Auto save credentials must be true or false"),
    )
    for option, complaint in boolean_options:
        if option not in settings:
            continue
        flag = settings[option]
        if isinstance(flag, bool):
            accepted[option] = flag
        else:
            problems.append(complaint)

    # The download path is delegated to the dedicated path validator
    if 'download_path' in settings:
        path_check = validate_file_path(settings['download_path'])
        if path_check['valid']:
            accepted['download_path'] = path_check['path']
        else:
            problems.extend(path_check['errors'])

    # Default page numbers must convert to an integer of at least 1
    for page_key in ('default_start_page', 'default_end_page'):
        if page_key not in settings:
            continue
        label = page_key.replace('_', ' ').title()
        try:
            page_num = int(settings[page_key])
        except (ValueError, TypeError):
            problems.append(f"{label} must be a valid number")
            continue
        if page_num < 1:
            problems.append(f"{label} must be 1 or greater")
        else:
            accepted[page_key] = page_num

    return {
        'valid': not problems,
        'errors': problems,
        'settings': accepted,
    }
|
|
|
|
|
|
def format_error_message(errors):
    """
    Turn a list of error messages into one user-facing string.

    Args:
        errors (list): List of error messages

    Returns:
        str: Empty string for no errors, the message itself for a single
            error, otherwise a "Multiple errors:" header followed by one
            bulleted line per message
    """
    if not errors:
        return ""

    if len(errors) == 1:
        return errors[0]

    bullet_lines = "\n• ".join(errors)
    return "Multiple errors:\n• " + bullet_lines
|
|
|
|
|
|
def is_safe_string(text, max_length=1000):
    """
    Decide whether a string is acceptable for display/storage.

    A value is rejected when it is not a str, when it is longer than
    max_length, or when it contains (case-insensitively) any of a fixed set
    of script/template markers.

    Args:
        text (str): Text to check
        max_length (int): Maximum allowed length

    Returns:
        bool: True if string is safe, False otherwise
    """
    if not isinstance(text, str) or len(text) > max_length:
        return False

    # Substrings treated as potential script injection or dangerous content
    blocked_markers = (
        '<script',
        'javascript:',
        'data:',
        'vbscript:',
        '<?php',
        '<%',
        '${',
    )

    # Lower-case once so the marker match ignores case
    lowered = text.lower()
    return not any(marker in lowered for marker in blocked_markers)