File size: 5,001 Bytes
7f22d3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
"""
工具函数模块:URL处理、内容哈希、熵值计算等
"""
import hashlib
import math
import re
from collections import Counter
from typing import Optional, Set
from urllib.parse import urljoin, urlparse


def normalize_url(url: str) -> Optional[str]:
    """
    Normalize a URL: strip the fragment, collapse './' and '../' path
    segments, and rebuild it as scheme://netloc/path[?query].

    Args:
        url: Raw URL string.

    Returns:
        The normalized URL, or None when the input is empty or lacks a
        scheme or host.
    """
    if not url:
        return None

    # Drop the fragment portion and surrounding whitespace before parsing.
    parsed = urlparse(url.split('#')[0].strip())
    if not (parsed.scheme and parsed.netloc):
        return None

    path = parsed.path
    if path:
        # Resolve '.' and '..' segments with a stack; empty segments
        # (from '//' runs) are dropped as well.
        stack = []
        for segment in path.split('/'):
            if segment == '..':
                if stack:
                    stack.pop()
            elif segment and segment != '.':
                stack.append(segment)
        path = '/' + '/'.join(stack)

    if parsed.query:
        return f"{parsed.scheme}://{parsed.netloc}{path}?{parsed.query}"
    return f"{parsed.scheme}://{parsed.netloc}{path}"


def is_valid_url(url: str, max_length: int = 2048) -> bool:
    """
    Check whether a URL is acceptable for crawling.

    Only non-empty http/https URLs within the length limit are valid.

    Args:
        url: URL string.
        max_length: Maximum allowed URL length.

    Returns:
        True when the URL is valid, False otherwise.
    """
    if not url or len(url) > max_length:
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed URLs (e.g. bad IPv6 brackets);
        # a validator should report False rather than propagate.
        return False

    # Restricting the scheme to http/https already rejects javascript:,
    # mailto:, tel:, data: and file: links, so no separate prefix check
    # is needed (the original one was unreachable).
    return parsed.scheme in ('http', 'https')


def calculate_shannon_entropy(text: str) -> float:
    """
    Calculate the Shannon entropy of a text string.

    Args:
        text: Input text.

    Returns:
        Entropy in bits per character; 0.0 for empty input.
    """
    if not text:
        return 0.0

    # Counter tallies every character in one O(n) pass instead of
    # calling text.count(c) once per unique character (O(n*k)).
    length = len(text)
    return -sum(
        (count / length) * math.log2(count / length)
        for count in Counter(text).values()
    )


def is_valid_text(text: str, min_length: int = 30, min_entropy: float = 3.5, max_entropy: float = 6.5) -> tuple[bool, str]:
    """
    Validate text content by length and Shannon entropy.

    Args:
        text: Text content.
        min_length: Minimum acceptable length.
        min_entropy: Lower entropy bound.
        max_entropy: Upper entropy bound.

    Returns:
        A (valid, reason) tuple; when valid, reason is the entropy
        value rendered as a string.
    """
    if len(text) < min_length:
        return False, "Too Short"

    ent = calculate_shannon_entropy(text)

    # Very low entropy suggests repetitive boilerplate (menus, nav, ads);
    # very high entropy suggests non-prose content (code, hashes).
    if ent < min_entropy:
        return False, f"Low Entropy ({ent:.2f}) - Likely menu/nav/ad"
    if ent > max_entropy:
        return False, f"High Entropy ({ent:.2f}) - Likely code/hash"
    return True, str(ent)


def content_hash(text: str) -> str:
    """
    Compute the MD5 digest of a text (used for de-duplication).

    Args:
        text: Text content.

    Returns:
        Hex-encoded MD5 digest string.
    """
    digest = hashlib.md5(text.encode('utf-8'))
    return digest.hexdigest()


def absolute_url(base_url: str, href: str) -> Optional[str]:
    """
    Resolve a (possibly relative) href against a base URL.

    Args:
        base_url: The page URL the href was found on.
        href: Relative or absolute link target.

    Returns:
        The normalized absolute URL, or None when href is empty or
        resolution fails.
    """
    if not href:
        return None

    try:
        # Best-effort: exotic inputs that make urljoin blow up are
        # simply treated as invalid links.
        return normalize_url(urljoin(base_url, href))
    except Exception:
        return None


def get_domain(url: str) -> Optional[str]:
    """
    Extract the network location (domain) component of a URL.

    Args:
        url: URL string.

    Returns:
        The netloc (may be an empty string for scheme-less input),
        or None if parsing fails.
    """
    try:
        return urlparse(url).netloc
    except Exception:
        return None


def filter_static_extensions(url: str, exclude_extensions: Optional[Set[str]] = None) -> bool:
    """
    Check whether a URL points at a static resource.

    Args:
        url: URL string.
        exclude_extensions: Extensions to treat as static; defaults to a
            built-in set of document/image/media/archive types.

    Returns:
        True if the URL looks like a static resource, False otherwise.

    NOTE(review): a query string defeats the suffix check
    ("/x.pdf?v=1" is not flagged) — confirm whether callers pass
    normalized, query-free URLs.
    """
    if exclude_extensions is None:
        exclude_extensions = {
            '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp',
            '.css', '.js', '.ico', '.zip', '.tar', '.gz', '.rar',
            '.mp4', '.mp3', '.avi', '.mov', '.wmv', '.flv',
            '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'
        }

    url_lower = url.lower()
    # str.endswith accepts a tuple of suffixes — one C-level call
    # replaces the per-extension Python loop.
    if url_lower.endswith(tuple(exclude_extensions)):
        return True

    # URLs routed through typical static-asset directories.
    static_patterns = ('/static/', '/assets/', '/media/', '/images/', '/css/', '/js/')
    return any(pattern in url_lower for pattern in static_patterns)