eboek.info-scraper/utils/validators.py
Louis Mylle ea4cab15c3 feat: Add installation scripts for Windows and Unix-based systems
- Created `install_and_run.bat` for Windows installation and setup.
- Created `install_and_run.sh` for Unix-based systems installation and setup.
- Removed `main.py` as it is no longer needed.
- Updated `requirements.txt` to specify package versions and added PyQt5.
- Deleted `start.bat` as it is redundant.
- Added unit tests for core functionality and scraping modes.
- Implemented input validation utilities in `utils/validators.py`.
- Added support for dual scraping modes in the scraper.
2026-01-10 14:45:00 +01:00

319 lines
8.6 KiB
Python

"""
Input validation utilities for the EBoek.info scraper GUI.
"""
import re
from urllib.parse import urlparse
def validate_page_range(start_page, end_page, max_pages=100):
    """
    Validate page range input.

    Both bounds are inclusive, so a range covers
    ``end_page - start_page + 1`` pages.

    Args:
        start_page (int or str): Starting page number
        end_page (int or str): Ending page number
        max_pages (int): Maximum number of pages allowed in a single range
            (defaults to 100)

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list, and the
            cleaned 'start_page' / 'end_page' integers (None for a bound
            that could not be converted)
    """
    errors = []
    cleaned_start = None
    cleaned_end = None

    # Convert each bound separately so both problems can be reported at once.
    # OverflowError covers values such as float('inf').
    try:
        cleaned_start = int(start_page)
    except (ValueError, TypeError, OverflowError):
        errors.append("Start page must be a valid number")

    try:
        cleaned_end = int(end_page)
    except (ValueError, TypeError, OverflowError):
        errors.append("End page must be a valid number")

    # Range checks only make sense when both bounds are real numbers
    if cleaned_start is not None and cleaned_end is not None:
        if cleaned_start < 1:
            errors.append("Start page must be 1 or greater")

        if cleaned_end < 1:
            errors.append("End page must be 1 or greater")

        if cleaned_start > cleaned_end:
            errors.append("Start page cannot be greater than end page")

        # The range is inclusive: pages 1..100 are exactly 100 pages,
        # pages 1..101 are 101 pages and must be rejected.
        if cleaned_end - cleaned_start + 1 > max_pages:
            errors.append(
                f"Page range too large (maximum {max_pages} pages at once)"
            )

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'start_page': cleaned_start,
        'end_page': cleaned_end,
    }
def validate_username(username):
    """
    Check an EBoek.info username for presence, length and allowed characters.

    Surrounding whitespace is ignored; only the first failing rule is
    reported.

    Args:
        username (str): Username to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list and the
            trimmed 'username' (empty string when nothing was supplied)
    """
    # Nothing supplied at all: report and bail out early
    if not username:
        return {
            'valid': False,
            'errors': ["Username is required"],
            'username': "",
        }

    name = username.strip()
    problems = []
    length = len(name)

    if length < 2:
        problems.append("Username must be at least 2 characters long")
    elif length > 50:
        problems.append("Username is too long (maximum 50 characters)")
    elif re.match(r'^[a-zA-Z0-9_.-]+$', name) is None:
        problems.append("Username contains invalid characters (use only letters, numbers, _, ., -)")

    return {
        'valid': not problems,
        'errors': problems,
        'username': name,
    }
def validate_password(password):
    """
    Check an EBoek.info password for presence and length.

    The password is used as given (no trimming); at most one error is
    reported.

    Args:
        password (str): Password to validate

    Returns:
        dict: Validation result with 'valid' bool and 'errors' list
    """
    if not password:
        problems = ["Password is required"]
    else:
        size = len(password)
        if size < 3:
            problems = ["Password must be at least 3 characters long"]
        elif size > 128:
            problems = ["Password is too long (maximum 128 characters)"]
        else:
            problems = []

    return {
        'valid': not problems,
        'errors': problems,
    }
def validate_url(url):
    """
    Validate URL format.

    Surrounding whitespace is removed before validation, so the value that
    is checked is the same value returned under 'url'.

    Args:
        url (str): URL to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list and the
            trimmed 'url' (empty string when no usable string was supplied)
    """
    errors = []
    is_text = isinstance(url, str)
    cleaned = url.strip() if is_text else ""

    if not url or (is_text and not cleaned):
        # None, empty, or whitespace-only input
        errors.append("URL is required")
    elif not is_text:
        # Non-string input cannot be parsed as a URL
        errors.append("Invalid URL format")
    else:
        try:
            parsed = urlparse(cleaned)
        except ValueError:
            # urlparse raises ValueError for malformed input such as an
            # unterminated IPv6 literal ("http://[::1")
            errors.append("Invalid URL format")
        else:
            if not parsed.scheme:
                errors.append("URL must include protocol (http:// or https://)")
            elif parsed.scheme not in ('http', 'https'):
                errors.append("URL must use http:// or https://")

            if not parsed.netloc:
                errors.append("URL must include domain name")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'url': cleaned,
    }
def validate_file_path(file_path):
    """
    Validate file path format.

    The path is trimmed first; a path that is empty after trimming is
    treated as missing rather than accepted as a valid empty path.

    Args:
        file_path (str): File path to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list and the
            trimmed 'path'
    """
    errors = []
    cleaned = file_path.strip() if file_path else ""

    if not cleaned:
        # Covers None, "" and whitespace-only input
        errors.append("File path is required")
    else:
        # Basic path validation - more specific validation would depend on OS.
        # ':' and path separators are allowed so that both Windows and Unix
        # style paths can pass.
        invalid_chars = ('<', '>', '|', '"', '*', '?')
        offending = next((ch for ch in invalid_chars if ch in cleaned), None)
        if offending is not None:
            # Only the first offending character (in list order) is reported
            errors.append(f"File path contains invalid character: {offending}")

        if len(cleaned) > 255:
            errors.append("File path is too long (maximum 255 characters)")

    return {
        'valid': len(errors) == 0,
        'errors': errors,
        'path': cleaned,
    }
def sanitize_filename(filename):
    """
    Sanitize a filename for safe storage.

    Characters that are invalid on common file systems are replaced with
    underscores, leading/trailing spaces and dots are removed, and the
    result is limited to 200 characters.

    Args:
        filename (str): Original filename

    Returns:
        str: Sanitized filename safe for most file systems; "download" when
            nothing usable remains
    """
    if not filename:
        return "download"

    # Replace invalid characters with underscores in a single pass
    replacements = str.maketrans({ch: '_' for ch in '<>:"/\\|?*'})
    sanitized = filename.translate(replacements)

    # Remove leading/trailing spaces and dots
    sanitized = sanitized.strip(' .')

    if len(sanitized) > 200:
        # Cutting at 200 characters can expose a new trailing space or dot,
        # so strip the tail again. The first character survived the strip
        # above, so the result cannot become empty here.
        sanitized = sanitized[:200].rstrip(' .')

    return sanitized or "download"
def validate_settings(settings):
    """
    Validate an application settings dictionary key by key.

    Only recognised keys that are present are checked; each one either
    lands in the cleaned settings or contributes an error message.

    Args:
        settings (dict): Settings to validate

    Returns:
        dict: Validation result with 'valid' bool, 'errors' list, and the
            cleaned 'settings' dict
    """
    problems = []
    accepted = {}

    # Boolean switches: each must be a real bool, checked in this order
    flag_messages = (
        ('headless_mode', "Headless mode must be true or false"),
        ('verbose_logging', "Verbose logging must be true or false"),
        ('auto_save_credentials', "Auto save credentials must be true or false"),
    )
    for key, message in flag_messages:
        if key not in settings:
            continue
        value = settings[key]
        if isinstance(value, bool):
            accepted[key] = value
        else:
            problems.append(message)

    # Download location is delegated to the file path validator
    if 'download_path' in settings:
        path_result = validate_file_path(settings['download_path'])
        if path_result['valid']:
            accepted['download_path'] = path_result['path']
        else:
            problems.extend(path_result['errors'])

    # Default page numbers must convert to integers of at least 1
    for key in ('default_start_page', 'default_end_page'):
        if key not in settings:
            continue
        label = key.replace('_', ' ').title()
        try:
            number = int(settings[key])
        except (ValueError, TypeError):
            problems.append(f"{label} must be a valid number")
            continue
        if number < 1:
            problems.append(f"{label} must be 1 or greater")
        else:
            accepted[key] = number

    return {
        'valid': not problems,
        'errors': problems,
        'settings': accepted,
    }
def format_error_message(errors):
    """
    Turn a list of error messages into one user-facing string.

    Args:
        errors (list): List of error messages

    Returns:
        str: Empty string for no errors, the message itself for a single
            error, otherwise a "Multiple errors:" header followed by one
            message per line
    """
    count = len(errors) if errors else 0

    if count == 0:
        return ""
    if count == 1:
        return errors[0]

    joined = "\n".join(errors)
    return "Multiple errors:\n" + joined
def is_safe_string(text, max_length=1000):
    """
    Report whether a string passes a simple display/storage safety check.

    A value is rejected when it is not a string, is longer than
    ``max_length``, or contains (case-insensitively) any of a fixed set of
    script/template markers.

    Args:
        text (str): Text to check
        max_length (int): Maximum allowed length

    Returns:
        bool: True if string is safe, False otherwise
    """
    if not isinstance(text, str) or len(text) > max_length:
        return False

    # Substrings that hint at script injection or template/server code
    blocked_markers = (
        '<script',
        'javascript:',
        'data:',
        'vbscript:',
        '<?php',
        '<%',
        '${',
    )

    lowered = text.lower()
    return not any(marker in lowered for marker in blocked_markers)