Spaces:
Running
Running
File size: 5,001 Bytes
7f22d3c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 |
"""
工具函数模块:URL处理、内容哈希、熵值计算等
"""
import hashlib
import math
import re
from collections import Counter
from typing import Optional, Set
from urllib.parse import urljoin, urlparse
def normalize_url(url: str) -> Optional[str]:
    """
    Normalize a URL: strip the fragment, collapse './' and '../' path
    segments, and lowercase the scheme and host.

    Per RFC 3986 the scheme and host compare case-insensitively, so folding
    them to lowercase makes equivalent URLs hash identically for dedup.
    The path keeps its case (paths are case-sensitive).

    Args:
        url: Raw URL string.

    Returns:
        The normalized URL, or None if the URL is empty or lacks a
        scheme/netloc (e.g. a relative link).
    """
    if not url:
        return None
    # Drop the fragment: it never reaches the server and would defeat dedup.
    url = url.split('#')[0].strip()
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        return None
    # Collapse '.' and '..' segments; empty segments are dropped too, so a
    # trailing slash is removed as part of normalization.
    path = parsed.path
    if path:
        normalized_parts = []
        for part in path.split('/'):
            if part == '..':
                if normalized_parts:
                    normalized_parts.pop()
            elif part and part != '.':
                normalized_parts.append(part)
        path = '/' + '/'.join(normalized_parts)
    # Rebuild with case-folded scheme/host; preserve the query verbatim.
    normalized = f"{parsed.scheme.lower()}://{parsed.netloc.lower()}{path}"
    if parsed.query:
        normalized += f"?{parsed.query}"
    return normalized
def is_valid_url(url: str, max_length: int = 2048) -> bool:
    """
    Validate that a URL is crawlable.

    Args:
        url: URL string.
        max_length: Maximum accepted URL length (guards against abuse).

    Returns:
        True only for non-empty http/https URLs within the length limit.
    """
    if not url or len(url) > max_length:
        return False
    # urlparse lowercases the scheme, so this single membership test also
    # rejects javascript:, mailto:, tel:, data:, file:, ftp:, etc. — the
    # original separate startswith() prefix check was unreachable dead code.
    return urlparse(url).scheme in ('http', 'https')
def calculate_shannon_entropy(text: str) -> float:
    """
    Compute the Shannon entropy of a text, in bits per character.

    Args:
        text: Input text.

    Returns:
        Entropy in bits (0.0 for empty input).
    """
    if not text:
        return 0.0
    # Counter makes this a single O(n) pass; the original called
    # str.count() once per distinct character (O(unique * n)).
    total = len(text)
    return -sum(
        (count / total) * math.log2(count / total)
        for count in Counter(text).values()
    )
def is_valid_text(text: str, min_length: int = 30, min_entropy: float = 3.5, max_entropy: float = 6.5) -> tuple[bool, str]:
    """
    Judge whether a text span is worth keeping, based on length and entropy.

    Args:
        text: Text content.
        min_length: Minimum acceptable character count.
        min_entropy: Entropy floor; below it the text is likely repetitive
            menu/nav/ad boilerplate.
        max_entropy: Entropy ceiling; above it the text is likely code or
            hash-like noise.

    Returns:
        (is_valid, reason) — on success the reason carries the entropy value.
    """
    if len(text) < min_length:
        return False, "Too Short"
    entropy = calculate_shannon_entropy(text)
    # Accept only the mid-entropy band typical of natural-language prose.
    if min_entropy <= entropy <= max_entropy:
        return True, str(entropy)
    if entropy < min_entropy:
        return False, f"Low Entropy ({entropy:.2f}) - Likely menu/nav/ad"
    return False, f"High Entropy ({entropy:.2f}) - Likely code/hash"
def content_hash(text: str) -> str:
    """
    Return the hex-encoded MD5 digest of a text (used only for content
    deduplication, not for anything security-sensitive).

    Args:
        text: Text content.

    Returns:
        32-character hexadecimal MD5 digest.
    """
    digest = hashlib.md5()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()
def absolute_url(base_url: str, href: str) -> Optional[str]:
    """
    Resolve a (possibly relative) href against a base URL and normalize it.

    Args:
        base_url: URL of the page the href was found on.
        href: Relative or absolute link target.

    Returns:
        The normalized absolute URL, or None when href is empty or
        resolution/normalization fails.
    """
    if not href:
        return None
    try:
        return normalize_url(urljoin(base_url, href))
    except Exception:
        # Malformed inputs are simply discarded by the crawl pipeline.
        return None
def get_domain(url: str) -> Optional[str]:
    """
    Extract the network location (host, possibly with port) from a URL.

    Args:
        url: URL string.

    Returns:
        The netloc component ('' when the input has no scheme/host),
        or None if parsing raises.
    """
    try:
        return urlparse(url).netloc
    except Exception:
        return None
def filter_static_extensions(url: str, exclude_extensions: Optional[Set[str]] = None) -> bool:
    """
    Decide whether a URL points at a static asset that should be skipped.

    Args:
        url: URL string.
        exclude_extensions: File extensions (with leading dot) to treat as
            static; defaults to common document/media/archive types.

    Returns:
        True if the URL looks like a static resource, False otherwise.
    """
    if exclude_extensions is None:
        exclude_extensions = {
            '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp',
            '.css', '.js', '.ico', '.zip', '.tar', '.gz', '.rar',
            '.mp4', '.mp3', '.avi', '.mov', '.wmv', '.flv',
            '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'
        }
    url_lower = url.lower()
    # Bug fix: test the extension on the parsed path, not the whole URL —
    # a query string (e.g. "/img.jpg?size=2") used to hide the suffix.
    path = urlparse(url_lower).path
    if path.endswith(tuple(exclude_extensions)):
        return True
    # Conventional static-content path segments.
    static_patterns = ('/static/', '/assets/', '/media/', '/images/', '/css/', '/js/')
    return any(pattern in url_lower for pattern in static_patterns)
|