from setup import *

import tempfile
from urllib.parse import urlparse

import requests
from langchain.docstore.document import Document
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader


def extract_urls(agentstate_result):
    """
    Extracts the URLs and page contents from an agent-state result.

    Expects ``agentstate_result['link_list']`` to be a list of dicts,
    each containing ``'url'`` and ``'content'`` keys.
    """
    urls = []
    content = []
    for item in agentstate_result['link_list']:
        urls.append(item['url'])
        content.append(item['content'])
    return urls, content
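
# Example (illustrative sketch; the dict shape below is inferred from the
# access pattern in extract_urls, not taken from a real agent run):
#
#   state = {'link_list': [{'url': 'https://example.com/report.pdf',
#                           'content': 'snippet of page text'}]}
#   urls, contents = extract_urls(state)
#   # urls     -> ['https://example.com/report.pdf']
#   # contents -> ['snippet of page text']
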
def classify_url_by_extension(url):
    """
    Classifies a URL based on its file extension.
    Focuses only on pdf and html, classifying others as unknown.
    """
    if not isinstance(url, str):
        raise ValueError(f"Expected a string, but got {type(url)}")

    try:
        file_extension = urlparse(url).path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return 'pdf'
        elif file_extension in ['html', 'htm']:
            return 'html'
        else:
            return 'unknown'
    except Exception as e:
        print(f"Error while parsing URL: {url} - {e}")
        return 'unknown'
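
# Quick check (illustrative; the URLs are placeholders):
#
#   classify_url_by_extension('https://example.com/paper.pdf')   -> 'pdf'
#   classify_url_by_extension('https://example.com/index.html')  -> 'html'
#   classify_url_by_extension('https://example.com/docs/guide')  -> 'unknown'
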
def classify_url_by_header(url):
    """
    Classifies a URL based on the HTTP Content-Type header.
    Focuses only on pdf and html, classifying others as unknown.
    """
    try:
        # Follow redirects so the Content-Type of the final target is inspected,
        # not that of an intermediate 3xx response.
        response = requests.head(url, timeout=5, allow_redirects=True)
        content_type = response.headers.get('Content-Type', '').lower()

        if 'pdf' in content_type:
            return 'pdf'
        elif 'html' in content_type:
            return 'html'
        else:
            return 'unknown'
    except requests.RequestException as e:
        print(f"Error while making HEAD request: {url} - {e}")
        return 'unknown'
def urls_classify_list(urls: list):
    """
    Classifies a list of URLs into pdf and html.
    Returns two separate lists: one for pdf URLs and one for html URLs.

    Extension-based classification is tried first; URLs that remain 'unknown'
    are re-checked via their Content-Type header. Anything still unclassified
    is grouped with the html URLs so it is at least attempted by the web loader.
    """
    if not isinstance(urls, list):
        raise ValueError("Expected a list of URLs")

    pdf_urls = []
    html_urls = []

    for url in urls:
        file_type = classify_url_by_extension(url)
        if file_type == 'unknown':
            file_type = classify_url_by_header(url)

        if file_type == 'pdf':
            pdf_urls.append(url)
        elif file_type in ('html', 'unknown'):
            html_urls.append(url)

    return pdf_urls, html_urls
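
# Example (sketch; the URLs below are placeholders, and network access is
# needed for the Content-Type fallback on extensionless URLs):
#
#   pdf_urls, html_urls = urls_classify_list([
#       'https://example.com/whitepaper.pdf',
#       'https://example.com/blog/post.html',
#       'https://example.com/about',          # no extension -> header check
#   ])
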
def clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25):
    """
    Loads HTML content from URLs, cleans the data, and splits it into smaller chunks.

    Args:
        html_urls (list): List of HTML URLs to process.
        chunk_size (int): Maximum size of each chunk, in characters.
        chunk_overlap (int): Overlap between consecutive chunks, in characters.

    Returns:
        list: List of Document chunks.
    """

    def clean_content(content):
        """
        Cleans the content by removing unwanted patterns and short lines.
        """
        cleaned_content = content.strip()
        lines = cleaned_content.split('\n')
        # Drop very short lines (navigation fragments, stray characters, etc.).
        meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]
        return '\n'.join(meaningful_lines)

    def split_document(doc_content, chunk_size, chunk_overlap):
        """
        Splits a document into smaller chunks with overlap.
        """
        chunks = []
        start = 0
        while start < len(doc_content):
            end = start + chunk_size
            chunk = doc_content[start:end]
            chunks.append(chunk)
            # Step forward by chunk_size - chunk_overlap; assumes chunk_overlap < chunk_size.
            start = end - chunk_overlap if end < len(doc_content) else len(doc_content)
        return chunks

    # Load the raw HTML documents.
    docs = []
    for url in html_urls:
        try:
            loader = WebBaseLoader(url)
            data = loader.load()
            docs.extend(data)
        except Exception as e:
            print(f"Error loading URL {url}: {e}")

    # Clean the page contents, dropping documents that end up empty.
    cleaned_docs = []
    for doc in docs:
        cleaned_content = clean_content(doc.page_content)
        if cleaned_content:
            doc.page_content = cleaned_content
            cleaned_docs.append(doc)

    # Split each cleaned document into overlapping chunks, preserving metadata.
    doc_splits = []
    for doc in cleaned_docs:
        chunks = split_document(doc.page_content, chunk_size, chunk_overlap)
        for chunk in chunks:
            doc_splits.append(Document(page_content=chunk, metadata=doc.metadata))

    return doc_splits
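
# Example (sketch; requires network access, and the URL is a placeholder):
#
#   chunks = clean_and_extract_html_data(
#       ['https://example.com/index.html'], chunk_size=500, chunk_overlap=50
#   )
#   # Each element is a Document holding a <=500-character slice of the
#   # cleaned page text, carrying the original page metadata.
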