|
|
from fastapi import FastAPI, UploadFile, File, Form, HTTPException |
|
|
from fastapi.responses import JSONResponse |
|
|
from fastapi.middleware.cors import CORSMiddleware |
|
|
from pydantic import BaseModel |
|
|
from typing import Optional, List, Dict |
|
|
from PIL import Image |
|
|
import io |
|
|
import numpy as np |
|
|
import os |
|
|
from datetime import datetime |
|
|
from pymongo import MongoClient
from bson import ObjectId
|
|
from huggingface_hub import InferenceClient |
|
|
|
|
|
from embedding_service import JinaClipEmbeddingService |
|
|
from qdrant_service import QdrantVectorService |
|
|
from advanced_rag import AdvancedRAG |
|
|
from pdf_parser import PDFIndexer |
|
|
from multimodal_pdf_parser import MultimodalPDFIndexer |
|
|
|
|
|
|
|
|
app = FastAPI( |
|
|
title="Event Social Media Embeddings & ChatbotRAG API", |
|
|
description="API để embeddings, search và ChatbotRAG với Jina CLIP v2 + Qdrant + MongoDB + LLM", |
|
|
version="2.0.0" |
|
|
) |
|
|
|
|
|
|
|
|
app.add_middleware( |
|
|
CORSMiddleware, |
|
|
allow_origins=["*"], |
|
|
allow_credentials=True, |
|
|
allow_methods=["*"], |
|
|
allow_headers=["*"], |
|
|
) |
|
|
|
|
|
|
|
|
print("Initializing services...") |
|
|
embedding_service = JinaClipEmbeddingService(model_path="jinaai/jina-clip-v2") |
|
|
|
|
|
collection_name = os.getenv("COLLECTION_NAME", "event_social_media") |
|
|
qdrant_service = QdrantVectorService( |
|
|
collection_name=collection_name, |
|
|
vector_size=embedding_service.get_embedding_dimension() |
|
|
) |
|
|
print(f"✓ Qdrant collection: {collection_name}") |
|
|
|
|
|
|
|
|
mongodb_uri = os.getenv("MONGODB_URI", "mongodb+srv://truongtn7122003:[email protected]/") |
|
|
mongo_client = MongoClient(mongodb_uri) |
|
|
db = mongo_client[os.getenv("MONGODB_DB_NAME", "chatbot_rag")] |
|
|
documents_collection = db["documents"] |
|
|
chat_history_collection = db["chat_history"] |
|
|
print("✓ MongoDB connected") |
|
|
|
|
|
|
|
|
hf_token = os.getenv("HUGGINGFACE_TOKEN") |
|
|
if hf_token: |
|
|
print("✓ Hugging Face token configured") |
|
|
|
|
|
|
|
|
advanced_rag = AdvancedRAG( |
|
|
embedding_service=embedding_service, |
|
|
qdrant_service=qdrant_service |
|
|
) |
|
|
print("✓ Advanced RAG pipeline initialized") |
|
|
|
|
|
|
|
|
pdf_indexer = PDFIndexer( |
|
|
embedding_service=embedding_service, |
|
|
qdrant_service=qdrant_service, |
|
|
documents_collection=documents_collection |
|
|
) |
|
|
print("✓ PDF Indexer initialized") |
|
|
|
|
|
|
|
|
multimodal_pdf_indexer = MultimodalPDFIndexer( |
|
|
embedding_service=embedding_service, |
|
|
qdrant_service=qdrant_service, |
|
|
documents_collection=documents_collection |
|
|
) |
|
|
print("✓ Multimodal PDF Indexer initialized") |
|
|
|
|
|
print("✓ Services initialized successfully") |
|
|
|
|
|
|
|
|
|
|
|
class SearchRequest(BaseModel): |
|
|
text: Optional[str] = None |
|
|
limit: int = 10 |
|
|
score_threshold: Optional[float] = None |
|
|
text_weight: float = 0.5 |
|
|
image_weight: float = 0.5 |
|
|
|
|
|
|
|
|
class SearchResponse(BaseModel): |
|
|
id: str |
|
|
confidence: float |
|
|
metadata: dict |
|
|
|
|
|
|
|
|
class IndexResponse(BaseModel): |
|
|
success: bool |
|
|
id: str |
|
|
message: str |
|
|
|
|
|
|
|
|
|
|
|
class ChatRequest(BaseModel): |
|
|
message: str |
|
|
use_rag: bool = True |
|
|
top_k: int = 3 |
|
|
system_message: Optional[str] = "You are a helpful AI assistant." |
|
|
max_tokens: int = 512 |
|
|
temperature: float = 0.7 |
|
|
top_p: float = 0.95 |
|
|
hf_token: Optional[str] = None |
|
|
|
|
|
use_advanced_rag: bool = True |
|
|
use_query_expansion: bool = True |
|
|
use_reranking: bool = True |
|
|
use_compression: bool = True |
|
|
score_threshold: float = 0.5 |
|
|
|
|
|
|
|
|
class ChatResponse(BaseModel): |
|
|
response: str |
|
|
context_used: List[Dict] |
|
|
timestamp: str |
|
|
rag_stats: Optional[Dict] = None |
|
|
|
|
|
|
|
|
class AddDocumentRequest(BaseModel): |
|
|
text: str |
|
|
metadata: Optional[Dict] = None |
|
|
|
|
|
|
|
|
class AddDocumentResponse(BaseModel): |
|
|
success: bool |
|
|
doc_id: str |
|
|
message: str |
|
|
|
|
|
|
|
|
class UploadPDFResponse(BaseModel): |
|
|
success: bool |
|
|
document_id: str |
|
|
filename: str |
|
|
chunks_indexed: int |
|
|
message: str |
|
|
|
|
|
|
|
|
@app.get("/") |
|
|
async def root(): |
|
|
"""Health check endpoint with comprehensive API documentation""" |
|
|
return { |
|
|
"status": "running", |
|
|
"service": "ChatbotRAG API - Advanced RAG with Multimodal Support", |
|
|
"version": "3.0.0", |
|
|
"vector_db": "Qdrant", |
|
|
"document_db": "MongoDB", |
|
|
"features": { |
|
|
"multiple_inputs": "Index up to 10 texts + 10 images per request", |
|
|
"advanced_rag": "Query expansion, reranking, contextual compression", |
|
|
"pdf_support": "Upload PDFs and chat about their content", |
|
|
"multimodal_pdf": "PDFs with text and image URLs - perfect for user guides", |
|
|
"chat_history": "Track conversation history", |
|
|
"hybrid_search": "Text + image search with Jina CLIP v2" |
|
|
}, |
|
|
"endpoints": { |
|
|
"indexing": { |
|
|
"POST /index": { |
|
|
"description": "Index multiple texts and images (NEW: up to 10 each)", |
|
|
"content_type": "multipart/form-data", |
|
|
"body": { |
|
|
"id": "string (required) - Document ID (primary)", |
|
|
"texts": "List[string] (optional) - Up to 10 texts", |
|
|
"images": "List[UploadFile] (optional) - Up to 10 images", |
|
|
"id_use": "string (optional) - ID của SocialMedia hoặc EventCode", |
|
|
"id_user": "string (optional) - ID của User" |
|
|
}, |
|
|
"example": "curl -X POST '/index' -F 'id=doc1' -F 'id_use=social_123' -F 'id_user=user_789' -F 'texts=Text 1' -F '[email protected]'", |
|
|
"response": { |
|
|
"success": True, |
|
|
"id": "doc1", |
|
|
"message": "Indexed successfully with 2 texts and 1 images" |
|
|
}, |
|
|
"use_cases": { |
|
|
"social_media_post": { |
|
|
"id": "post_uuid_123", |
|
|
"id_use": "social_media_456", |
|
|
"id_user": "user_789", |
|
|
"description": "Link post to social media account and user" |
|
|
}, |
|
|
"event_post": { |
|
|
"id": "post_uuid_789", |
|
|
"id_use": "event_code_ABC123", |
|
|
"id_user": "user_101", |
|
|
"description": "Link post to event and user" |
|
|
} |
|
|
} |
|
|
}, |
|
|
"POST /documents": { |
|
|
"description": "Add text document to knowledge base", |
|
|
"content_type": "application/json", |
|
|
"body": { |
|
|
"text": "string (required) - Document content", |
|
|
"metadata": "object (optional) - Additional metadata" |
|
|
}, |
|
|
"example": { |
|
|
"text": "How to create event: Click 'Create Event' button...", |
|
|
"metadata": {"category": "tutorial", "source": "user_guide"} |
|
|
} |
|
|
}, |
|
|
"POST /upload-pdf": { |
|
|
"description": "Upload PDF file (text only)", |
|
|
"content_type": "multipart/form-data", |
|
|
"body": { |
|
|
"file": "UploadFile (required) - PDF file", |
|
|
"title": "string (optional) - Document title", |
|
|
"category": "string (optional) - Category", |
|
|
"description": "string (optional) - Description" |
|
|
}, |
|
|
"example": "curl -X POST '/upload-pdf' -F '[email protected]' -F 'title=User Guide'" |
|
|
}, |
|
|
"POST /upload-pdf-multimodal": { |
|
|
"description": "Upload PDF with text and image URLs (RECOMMENDED for user guides)", |
|
|
"content_type": "multipart/form-data", |
|
|
"features": [ |
|
|
"Extracts text from PDF", |
|
|
"Detects image URLs (http://, https://)", |
|
|
"Supports markdown: ", |
|
|
"Supports HTML: <img src='url'>", |
|
|
"Links images to text chunks", |
|
|
"Returns images with context in chat" |
|
|
], |
|
|
"body": { |
|
|
"file": "UploadFile (required) - PDF file with image URLs", |
|
|
"title": "string (optional) - Document title", |
|
|
"category": "string (optional) - e.g. 'user_guide', 'tutorial'", |
|
|
"description": "string (optional)" |
|
|
}, |
|
|
"example": "curl -X POST '/upload-pdf-multimodal' -F 'file=@guide_with_images.pdf' -F 'category=user_guide'", |
|
|
"response": { |
|
|
"success": True, |
|
|
"document_id": "pdf_multimodal_20251029_150000", |
|
|
"chunks_indexed": 25, |
|
|
"message": "PDF indexed with 25 chunks and 15 images" |
|
|
}, |
|
|
"use_case": "Perfect for user guides with screenshots, tutorials with diagrams" |
|
|
} |
|
|
}, |
|
|
"search": { |
|
|
"POST /search": { |
|
|
"description": "Hybrid search with text and/or image", |
|
|
"body": { |
|
|
"text": "string (optional) - Query text", |
|
|
"image": "UploadFile (optional) - Query image", |
|
|
"limit": "int (default: 10)", |
|
|
"score_threshold": "float (optional, 0-1)", |
|
|
"text_weight": "float (default: 0.5)", |
|
|
"image_weight": "float (default: 0.5)" |
|
|
} |
|
|
}, |
|
|
"POST /search/text": { |
|
|
"description": "Text-only search", |
|
|
"body": {"text": "string", "limit": "int", "score_threshold": "float"} |
|
|
}, |
|
|
"POST /search/image": { |
|
|
"description": "Image-only search", |
|
|
"body": {"image": "UploadFile", "limit": "int", "score_threshold": "float"} |
|
|
}, |
|
|
"POST /rag/search": { |
|
|
"description": "Search in RAG knowledge base", |
|
|
"body": {"query": "string", "top_k": "int (default: 5)", "score_threshold": "float (default: 0.5)"} |
|
|
} |
|
|
}, |
|
|
"chat": { |
|
|
"POST /chat": { |
|
|
"description": "Chat với Advanced RAG (Query expansion + Reranking + Compression)", |
|
|
"content_type": "application/json", |
|
|
"body": { |
|
|
"message": "string (required) - User question", |
|
|
"use_rag": "bool (default: true) - Enable RAG retrieval", |
|
|
"use_advanced_rag": "bool (default: true) - Use advanced RAG pipeline (RECOMMENDED)", |
|
|
"use_query_expansion": "bool (default: true) - Expand query with variations", |
|
|
"use_reranking": "bool (default: true) - Rerank results for accuracy", |
|
|
"use_compression": "bool (default: true) - Compress context to relevant parts", |
|
|
"top_k": "int (default: 3) - Number of documents to retrieve", |
|
|
"score_threshold": "float (default: 0.5) - Min relevance score (0-1)", |
|
|
"max_tokens": "int (default: 512) - Max response tokens", |
|
|
"temperature": "float (default: 0.7) - Creativity (0-1)", |
|
|
"hf_token": "string (optional) - Hugging Face token" |
|
|
}, |
|
|
"response": { |
|
|
"response": "string - AI answer", |
|
|
"context_used": "array - Retrieved documents with metadata", |
|
|
"timestamp": "string", |
|
|
"rag_stats": "object - RAG pipeline statistics (query variants, retrieval counts)" |
|
|
}, |
|
|
"example_advanced": { |
|
|
"message": "Làm sao để upload PDF có hình ảnh?", |
|
|
"use_advanced_rag": True, |
|
|
"use_reranking": True, |
|
|
"top_k": 5, |
|
|
"score_threshold": 0.5 |
|
|
}, |
|
|
"example_response_with_images": { |
|
|
"response": "Để upload PDF có hình ảnh, sử dụng endpoint /upload-pdf-multimodal...", |
|
|
"context_used": [ |
|
|
{ |
|
|
"id": "pdf_multimodal_...._p2_c1", |
|
|
"confidence": 0.89, |
|
|
"metadata": { |
|
|
"text": "Bước 1: Chuẩn bị PDF với image URLs...", |
|
|
"has_images": True, |
|
|
"image_urls": [ |
|
|
"https://example.com/screenshot1.png", |
|
|
"https://example.com/diagram.jpg" |
|
|
], |
|
|
"num_images": 2, |
|
|
"page": 2 |
|
|
} |
|
|
} |
|
|
], |
|
|
"rag_stats": { |
|
|
"original_query": "Làm sao để upload PDF có hình ảnh?", |
|
|
"expanded_queries": ["upload PDF hình ảnh", "PDF có ảnh"], |
|
|
"initial_results": 10, |
|
|
"after_rerank": 5, |
|
|
"after_compression": 5 |
|
|
} |
|
|
}, |
|
|
"notes": [ |
|
|
"Advanced RAG significantly improves answer quality", |
|
|
"When multimodal PDF is used, images are returned in metadata", |
|
|
"Requires HUGGINGFACE_TOKEN for actual LLM generation" |
|
|
] |
|
|
}, |
|
|
"GET /history": { |
|
|
"description": "Get chat history", |
|
|
"query_params": {"limit": "int (default: 10)", "skip": "int (default: 0)"}, |
|
|
"response": {"history": "array", "total": "int"} |
|
|
} |
|
|
}, |
|
|
"management": { |
|
|
"GET /documents/pdf": { |
|
|
"description": "List all PDF documents", |
|
|
"response": {"documents": "array", "total": "int"} |
|
|
}, |
|
|
"DELETE /documents/pdf/{document_id}": { |
|
|
"description": "Delete PDF and all its chunks", |
|
|
"response": {"success": "bool", "message": "string"} |
|
|
}, |
|
|
"GET /document/{doc_id}": { |
|
|
"description": "Get document by ID", |
|
|
"response": {"success": "bool", "data": "object"} |
|
|
}, |
|
|
"DELETE /delete/{doc_id}": { |
|
|
"description": "Delete document by ID", |
|
|
"response": {"success": "bool", "message": "string"} |
|
|
}, |
|
|
"GET /stats": { |
|
|
"description": "Get Qdrant collection statistics", |
|
|
"response": {"vectors_count": "int", "segments": "int", "indexed_vectors_count": "int"} |
|
|
} |
|
|
} |
|
|
}, |
|
|
"quick_start": { |
|
|
"1_upload_multimodal_pdf": "curl -X POST '/upload-pdf-multimodal' -F 'file=@user_guide.pdf' -F 'title=Guide'", |
|
|
"2_verify_upload": "curl '/documents/pdf'", |
|
|
"3_chat_with_rag": "curl -X POST '/chat' -H 'Content-Type: application/json' -d '{\"message\": \"How to...?\", \"use_advanced_rag\": true}'", |
|
|
"4_see_images_in_context": "response['context_used'][0]['metadata']['image_urls']" |
|
|
}, |
|
|
"use_cases": { |
|
|
"user_guide_with_screenshots": { |
|
|
"endpoint": "/upload-pdf-multimodal", |
|
|
"description": "PDFs with text instructions + image URLs for visual guidance", |
|
|
"benefits": ["Images linked to text chunks", "Chatbot returns relevant screenshots", "Perfect for step-by-step guides"] |
|
|
}, |
|
|
"simple_text_docs": { |
|
|
"endpoint": "/upload-pdf", |
|
|
"description": "Simple PDFs with text only (FAQ, policies, etc.)" |
|
|
}, |
|
|
"social_media_posts": { |
|
|
"endpoint": "/index", |
|
|
"description": "Index multiple posts with texts (up to 10) and images (up to 10)" |
|
|
}, |
|
|
"complex_queries": { |
|
|
"endpoint": "/chat", |
|
|
"description": "Use advanced RAG for better accuracy on complex questions", |
|
|
"settings": {"use_advanced_rag": True, "use_reranking": True, "use_compression": True} |
|
|
} |
|
|
}, |
|
|
"best_practices": { |
|
|
"pdf_format": [ |
|
|
"Include image URLs in text (http://, https://)", |
|
|
"Use markdown format:  or HTML: <img src='url'>", |
|
|
"Clear structure with headings and sections", |
|
|
"Link images close to their related text" |
|
|
], |
|
|
"chat_settings": { |
|
|
"for_accuracy": {"temperature": 0.3, "use_advanced_rag": True, "use_reranking": True}, |
|
|
"for_creativity": {"temperature": 0.8, "use_advanced_rag": False}, |
|
|
"for_factual_answers": {"temperature": 0.3, "use_compression": True, "score_threshold": 0.6} |
|
|
}, |
|
|
"retrieval_tuning": { |
|
|
"not_finding_info": "Lower score_threshold to 0.3-0.4, increase top_k to 7-10", |
|
|
"too_much_context": "Increase score_threshold to 0.6-0.7, decrease top_k to 3-5", |
|
|
"slow_responses": "Disable compression, use basic RAG, decrease top_k" |
|
|
} |
|
|
}, |
|
|
"links": { |
|
|
"docs": "http://localhost:8000/docs", |
|
|
"redoc": "http://localhost:8000/redoc", |
|
|
"openapi": "http://localhost:8000/openapi.json", |
|
|
"guides": { |
|
|
"multimodal_pdf": "See MULTIMODAL_PDF_GUIDE.md", |
|
|
"advanced_rag": "See ADVANCED_RAG_GUIDE.md", |
|
|
"pdf_general": "See PDF_RAG_GUIDE.md", |
|
|
"quick_start": "See QUICK_START_PDF.md" |
|
|
} |
|
|
}, |
|
|
"system_info": { |
|
|
"embedding_model": "Jina CLIP v2 (multimodal)", |
|
|
"vector_db": "Qdrant with HNSW index", |
|
|
"document_db": "MongoDB", |
|
|
"rag_pipeline": "Advanced RAG with query expansion, reranking, compression", |
|
|
"pdf_parser": "pypdfium2 with URL extraction", |
|
|
"max_inputs": "10 texts + 10 images per /index request" |
|
|
} |
|
|
} |
|
|
|
|
|
@app.post("/index", response_model=IndexResponse) |
|
|
async def index_data( |
|
|
id: str = Form(...), |
|
|
texts: Optional[List[str]] = Form(None), |
|
|
images: Optional[List[UploadFile]] = File(None), |
|
|
id_use: Optional[str] = Form(None), |
|
|
id_user: Optional[str] = Form(None) |
|
|
): |
|
|
""" |
|
|
Index data into the vector database (supports multiple texts and images)
|
|
|
|
|
Body: |
|
|
- id: Document ID (primary ID) |
|
|
- texts: List of text contents (Vietnamese supported) - up to 10 texts
|
|
- images: List of image files (optional) - up to 10 images
|
|
- id_use: ID of the SocialMedia or EventCode (optional)
|
|
- id_user: ID of the User (optional)
|
|
|
|
|
Returns: |
|
|
- success: True/False |
|
|
- id: Document ID |
|
|
- message: Status message |
|
|
|
|
|
Example: |
|
|
```bash |
|
|
curl -X POST '/index' \ |
|
|
-F 'id=doc123' \ |
|
|
-F 'id_use=social_media_456' \ |
|
|
-F 'id_user=user_789' \ |
|
|
-F 'texts=Post content 1' \ |
|
|
-F 'texts=Post content 2' \ |
|
|
-F 'images=@photo.jpg'
|
|
``` |
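Python example using the `requests` package (illustrative sketch, not part of this service; assumes the API is reachable at http://localhost:8000):

```python
import requests

with open("photo.jpg", "rb") as img:
    resp = requests.post(
        "http://localhost:8000/index",
        data={
            "id": "doc123",
            "id_use": "social_media_456",
            "id_user": "user_789",
            # Repeated form fields become List[str] on the FastAPI side
            "texts": ["Post content 1", "Post content 2"],
        },
        files=[("images", ("photo.jpg", img, "image/jpeg"))],
    )
print(resp.json())
```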
|
|
""" |
|
|
try: |
|
|
|
|
|
if texts is None and images is None: |
|
|
raise HTTPException(status_code=400, detail="Phải cung cấp ít nhất texts hoặc images") |
|
|
|
|
|
if texts and len(texts) > 10: |
|
|
raise HTTPException(status_code=400, detail="Tối đa 10 texts") |
|
|
|
|
|
if images and len(images) > 10: |
|
|
raise HTTPException(status_code=400, detail="Tối đa 10 images") |
|
|
|
|
|
|
|
|
text_embeddings = [] |
|
|
image_embeddings = [] |
|
|
|
|
|
|
|
|
if texts: |
|
|
for text in texts: |
|
|
if text and text.strip(): |
|
|
text_emb = embedding_service.encode_text(text) |
|
|
text_embeddings.append(text_emb) |
|
|
|
|
|
|
|
|
if images: |
|
|
for image in images: |
|
|
if image.filename: |
|
|
image_bytes = await image.read() |
|
|
pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB') |
|
|
image_emb = embedding_service.encode_image(pil_image) |
|
|
image_embeddings.append(image_emb) |
|
|
|
|
|
|
|
|
all_embeddings = [] |
|
|
|
|
|
if text_embeddings: |
|
|
|
|
|
avg_text_embedding = np.mean(text_embeddings, axis=0) |
|
|
all_embeddings.append(avg_text_embedding) |
|
|
|
|
|
if image_embeddings: |
|
|
|
|
|
avg_image_embedding = np.mean(image_embeddings, axis=0) |
|
|
all_embeddings.append(avg_image_embedding) |
|
|
|
|
|
if not all_embeddings: |
|
|
raise HTTPException(status_code=400, detail="Không có embedding nào được tạo từ texts hoặc images") |
|
|
|
|
|
|
|
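# Fuse modalities: average the per-modality embeddings (text mean and image mean) into a
# single document vector, then L2-normalize it. The norm call below assumes the embedding
# service returns 2-D arrays of shape (1, dim).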
|
combined_embedding = np.mean(all_embeddings, axis=0) |
|
|
|
|
|
|
|
|
combined_embedding = combined_embedding / np.linalg.norm(combined_embedding, axis=1, keepdims=True) |
|
|
|
|
|
|
|
|
metadata = { |
|
|
"texts": texts if texts else [], |
|
|
"text_count": len(texts) if texts else 0, |
|
|
"image_count": len(images) if images else 0, |
|
|
"image_filenames": [img.filename for img in images] if images else [], |
|
|
"id_use": id_use if id_use else None, |
|
|
"id_user": id_user if id_user else None |
|
|
} |
|
|
|
|
|
result = qdrant_service.index_data( |
|
|
doc_id=id, |
|
|
embedding=combined_embedding, |
|
|
metadata=metadata |
|
|
) |
|
|
|
|
|
return IndexResponse( |
|
|
success=True, |
|
|
id=result["original_id"], |
|
|
message=f"Đã index thành công document {result['original_id']} với {len(texts) if texts else 0} texts và {len(images) if images else 0} images (Qdrant UUID: {result['qdrant_id']})" |
|
|
) |
|
|
|
|
|
except HTTPException: |
|
|
raise |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi index: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/search", response_model=List[SearchResponse]) |
|
|
async def search( |
|
|
text: Optional[str] = Form(None), |
|
|
image: Optional[UploadFile] = File(None), |
|
|
limit: int = Form(10), |
|
|
score_threshold: Optional[float] = Form(None), |
|
|
text_weight: float = Form(0.5), |
|
|
image_weight: float = Form(0.5) |
|
|
): |
|
|
""" |
|
|
Search for similar documents by text and/or image
|
|
|
|
|
Body: |
|
|
- text: Query text (Vietnamese supported)
|
|
- image: Query image (optional) |
|
|
- limit: Number of results (default: 10)
|
|
- score_threshold: Minimum confidence score (0-1) |
|
|
- text_weight: Weight for text search (default: 0.5)
|
|
- image_weight: Weight for image search (default: 0.5)
|
|
|
|
|
Returns: |
|
|
- List of results with id, confidence, and metadata
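Python example using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000):

```python
import requests

with open("query.jpg", "rb") as img:
    resp = requests.post(
        "http://localhost:8000/search",
        data={"text": "music festival", "limit": 5,
              "text_weight": 0.7, "image_weight": 0.3},
        files={"image": ("query.jpg", img, "image/jpeg")},
    )
for hit in resp.json():
    print(hit["id"], hit["confidence"])
```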
|
|
""" |
|
|
try: |
|
|
|
|
|
text_embedding = None |
|
|
image_embedding = None |
|
|
|
|
|
|
|
|
if text and text.strip(): |
|
|
text_embedding = embedding_service.encode_text(text) |
|
|
|
|
|
|
|
|
if image: |
|
|
image_bytes = await image.read() |
|
|
pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB') |
|
|
image_embedding = embedding_service.encode_image(pil_image) |
|
|
|
|
|
|
|
|
if text_embedding is None and image_embedding is None: |
|
|
raise HTTPException(status_code=400, detail="Phải cung cấp ít nhất text hoặc image để search") |
|
|
|
|
|
|
|
|
results = qdrant_service.hybrid_search( |
|
|
text_embedding=text_embedding, |
|
|
image_embedding=image_embedding, |
|
|
text_weight=text_weight, |
|
|
image_weight=image_weight, |
|
|
limit=limit, |
|
|
score_threshold=score_threshold, |
|
|
ef=256 |
|
|
) |
|
|
|
|
|
|
|
|
return [ |
|
|
SearchResponse( |
|
|
id=result["id"], |
|
|
confidence=result["confidence"], |
|
|
metadata=result["metadata"] |
|
|
) |
|
|
for result in results |
|
|
] |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi search: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/search/text", response_model=List[SearchResponse]) |
|
|
async def search_by_text( |
|
|
text: str = Form(...), |
|
|
limit: int = Form(10), |
|
|
score_threshold: Optional[float] = Form(None) |
|
|
): |
|
|
""" |
|
|
Search by text only (Vietnamese supported)
|
|
|
|
|
Body: |
|
|
- text: Query text (Vietnamese supported)
|
|
- limit: Number of results
|
|
- score_threshold: Minimum confidence score |
|
|
|
|
|
Returns: |
|
|
- List of results |
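Python example using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000):

```python
import requests

resp = requests.post(
    "http://localhost:8000/search/text",
    data={"text": "music event", "limit": 5, "score_threshold": 0.3},
)
for hit in resp.json():
    print(hit["id"], hit["confidence"])
```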
|
|
""" |
|
|
try: |
|
|
|
|
|
text_embedding = embedding_service.encode_text(text) |
|
|
|
|
|
|
|
|
results = qdrant_service.search( |
|
|
query_embedding=text_embedding, |
|
|
limit=limit, |
|
|
score_threshold=score_threshold, |
|
|
ef=256 |
|
|
) |
|
|
|
|
|
return [ |
|
|
SearchResponse( |
|
|
id=result["id"], |
|
|
confidence=result["confidence"], |
|
|
metadata=result["metadata"] |
|
|
) |
|
|
for result in results |
|
|
] |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi search: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/search/image", response_model=List[SearchResponse]) |
|
|
async def search_by_image( |
|
|
image: UploadFile = File(...), |
|
|
limit: int = Form(10), |
|
|
score_threshold: Optional[float] = Form(None) |
|
|
): |
|
|
""" |
|
|
Search by image only
|
|
|
|
|
Body: |
|
|
- image: Query image |
|
|
- limit: Number of results
|
|
- score_threshold: Minimum confidence score |
|
|
|
|
|
Returns: |
|
|
- List of results |
|
|
""" |
|
|
try: |
|
|
|
|
|
image_bytes = await image.read() |
|
|
pil_image = Image.open(io.BytesIO(image_bytes)).convert('RGB') |
|
|
image_embedding = embedding_service.encode_image(pil_image) |
|
|
|
|
|
|
|
|
results = qdrant_service.search( |
|
|
query_embedding=image_embedding, |
|
|
limit=limit, |
|
|
score_threshold=score_threshold, |
|
|
ef=256 |
|
|
) |
|
|
|
|
|
return [ |
|
|
SearchResponse( |
|
|
id=result["id"], |
|
|
confidence=result["confidence"], |
|
|
metadata=result["metadata"] |
|
|
) |
|
|
for result in results |
|
|
] |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi search: {str(e)}") |
|
|
|
|
|
|
|
|
@app.delete("/delete/{doc_id}") |
|
|
async def delete_document(doc_id: str): |
|
|
""" |
|
|
Delete document by ID (MongoDB ObjectId or UUID)
|
|
|
|
|
Args: |
|
|
- doc_id: Document ID to delete |
|
|
|
|
|
Returns: |
|
|
- Success message |
|
|
""" |
|
|
try: |
|
|
qdrant_service.delete_by_id(doc_id) |
|
|
return {"success": True, "message": f"Đã xóa document {doc_id}"} |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi xóa: {str(e)}") |
|
|
|
|
|
|
|
|
@app.get("/document/{doc_id}") |
|
|
async def get_document(doc_id: str): |
|
|
""" |
|
|
Get document by ID (MongoDB ObjectId or UUID)
|
|
|
|
|
Args: |
|
|
- doc_id: Document ID (MongoDB ObjectId) |
|
|
|
|
|
Returns: |
|
|
- Document data |
|
|
""" |
|
|
try: |
|
|
doc = qdrant_service.get_by_id(doc_id) |
|
|
if doc: |
|
|
return { |
|
|
"success": True, |
|
|
"data": doc |
|
|
} |
|
|
raise HTTPException(status_code=404, detail=f"Không tìm thấy document {doc_id}") |
|
|
except HTTPException: |
|
|
raise |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi get document: {str(e)}") |
|
|
|
|
|
|
|
|
@app.get("/stats") |
|
|
async def get_stats(): |
|
|
""" |
|
|
Get collection statistics
|
|
|
|
|
Returns: |
|
|
- Collection statistics |
|
|
""" |
|
|
try: |
|
|
info = qdrant_service.get_collection_info() |
|
|
return info |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Lỗi khi lấy stats: {str(e)}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/chat", response_model=ChatResponse) |
|
|
async def chat(request: ChatRequest): |
|
|
""" |
|
|
Chat endpoint with Advanced RAG
|
|
|
|
|
Body: |
|
|
- message: User message |
|
|
- use_rag: Enable RAG retrieval (default: true) |
|
|
- top_k: Number of documents to retrieve (default: 3) |
|
|
- system_message: System prompt (optional) |
|
|
- max_tokens: Max tokens for response (default: 512) |
|
|
- temperature: Temperature for generation (default: 0.7) |
|
|
- hf_token: Hugging Face token (optional; falls back to the HUGGINGFACE_TOKEN env var if not provided)
|
|
- use_advanced_rag: Use advanced RAG pipeline (default: true) |
|
|
- use_query_expansion: Enable query expansion (default: true) |
|
|
- use_reranking: Enable reranking (default: true) |
|
|
- use_compression: Enable context compression (default: true) |
|
|
- score_threshold: Minimum relevance score (default: 0.5) |
|
|
|
|
|
Returns: |
|
|
- response: Generated response |
|
|
- context_used: Retrieved context documents |
|
|
- timestamp: Response timestamp |
|
|
- rag_stats: Statistics from RAG pipeline |
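Python example using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000 and a configured Hugging Face token):

```python
import requests

payload = {
    "message": "How do I upload a PDF that contains images?",
    "use_advanced_rag": True,
    "use_reranking": True,
    "top_k": 5,
    "score_threshold": 0.5,
}
resp = requests.post("http://localhost:8000/chat", json=payload)
data = resp.json()
print(data["response"])
print(data["rag_stats"])       # pipeline statistics
print(data["context_used"])    # retrieved documents (may include image_urls)
```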
|
|
""" |
|
|
try: |
|
|
|
|
|
context_used = [] |
|
|
rag_stats = None |
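# Step 1: retrieve supporting context, either through the Advanced RAG pipeline
# (query expansion, reranking, contextual compression) or via plain vector search.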
|
|
|
|
|
if request.use_rag: |
|
|
if request.use_advanced_rag: |
|
|
|
|
|
documents, stats = advanced_rag.hybrid_rag_pipeline( |
|
|
query=request.message, |
|
|
top_k=request.top_k, |
|
|
score_threshold=request.score_threshold, |
|
|
use_reranking=request.use_reranking, |
|
|
use_compression=request.use_compression, |
|
|
max_context_tokens=500 |
|
|
) |
|
|
|
|
|
|
|
|
context_used = [ |
|
|
{ |
|
|
"id": doc.id, |
|
|
"confidence": doc.confidence, |
|
|
"metadata": doc.metadata |
|
|
} |
|
|
for doc in documents |
|
|
] |
|
|
rag_stats = stats |
|
|
|
|
|
|
|
|
context_text = advanced_rag.format_context_for_llm(documents) |
|
|
|
|
|
else: |
|
|
|
|
|
query_embedding = embedding_service.encode_text(request.message) |
|
|
|
|
|
results = qdrant_service.search( |
|
|
query_embedding=query_embedding, |
|
|
limit=request.top_k, |
|
|
score_threshold=request.score_threshold |
|
|
) |
|
|
context_used = results |
|
|
|
|
|
|
|
|
context_text = "\n\nRelevant Context:\n" |
|
|
for i, doc in enumerate(context_used, 1): |
|
|
doc_text = doc["metadata"].get("text", "") |
|
|
confidence = doc["confidence"] |
|
|
context_text += f"\n[{i}] (Confidence: {confidence:.2f})\n{doc_text}\n" |
|
|
|
|
|
|
|
|
if request.use_rag and context_used: |
|
|
if request.use_advanced_rag: |
|
|
|
|
|
system_message = advanced_rag.build_rag_prompt( |
|
|
query=request.message, |
|
|
context=context_text, |
|
|
system_message=request.system_message |
|
|
) |
|
|
else: |
|
|
|
|
|
system_message = f"{request.system_message}\n{context_text}\n\nPlease use the above context to answer the user's question when relevant." |
|
|
else: |
|
|
system_message = request.system_message |
|
|
|
|
|
|
|
|
token = request.hf_token or hf_token |
|
|
|
|
|
if not token: |
|
|
response = f"""[LLM Response Placeholder] |
|
|
|
|
|
Context retrieved: {len(context_used)} documents |
|
|
User question: {request.message} |
|
|
|
|
|
To enable actual LLM generation: |
|
|
1. Set HUGGINGFACE_TOKEN environment variable, OR |
|
|
2. Pass hf_token in request body |
|
|
|
|
|
Example: |
|
|
{{ |
|
|
"message": "Your question", |
|
|
"hf_token": "hf_xxxxxxxxxxxxx" |
|
|
}} |
|
|
""" |
|
|
else: |
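# Step 2: generate the answer with the Hugging Face Inference API (streamed chat completion).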
|
|
try: |
|
|
client = InferenceClient( |
|
|
token=token,
|
|
model="openai/gpt-oss-20b" |
|
|
) |
|
|
|
|
|
|
|
|
messages = [ |
|
|
{"role": "system", "content": system_message}, |
|
|
{"role": "user", "content": request.message} |
|
|
] |
|
|
|
|
|
|
|
|
response = "" |
|
|
for msg in client.chat_completion( |
|
|
messages, |
|
|
max_tokens=request.max_tokens, |
|
|
stream=True, |
|
|
temperature=request.temperature, |
|
|
top_p=request.top_p, |
|
|
): |
|
|
choices = msg.choices |
|
|
if len(choices) and choices[0].delta.content: |
|
|
response += choices[0].delta.content |
|
|
|
|
|
except Exception as e: |
|
|
response = f"Error generating response with LLM: {str(e)}\n\nContext was retrieved successfully, but LLM generation failed." |
|
|
|
|
|
|
|
|
chat_data = { |
|
|
"user_message": request.message, |
|
|
"assistant_response": response, |
|
|
"context_used": context_used, |
|
|
"timestamp": datetime.utcnow() |
|
|
} |
|
|
chat_history_collection.insert_one(chat_data) |
|
|
|
|
|
return ChatResponse( |
|
|
response=response, |
|
|
context_used=context_used, |
|
|
timestamp=datetime.utcnow().isoformat(), |
|
|
rag_stats=rag_stats |
|
|
) |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/documents", response_model=AddDocumentResponse) |
|
|
async def add_document(request: AddDocumentRequest): |
|
|
""" |
|
|
Add document to knowledge base |
|
|
|
|
|
Body: |
|
|
- text: Document text |
|
|
- metadata: Additional metadata (optional) |
|
|
|
|
|
Returns: |
|
|
- success: True/False |
|
|
- doc_id: MongoDB document ID |
|
|
- message: Status message |
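Python example using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000):

```python
import requests

resp = requests.post(
    "http://localhost:8000/documents",
    json={
        "text": "How to create an event: click the 'Create Event' button...",
        "metadata": {"category": "tutorial", "source": "user_guide"},
    },
)
print(resp.json()["doc_id"])
```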
|
|
""" |
|
|
try: |
|
|
|
|
|
doc_data = { |
|
|
"text": request.text, |
|
|
"metadata": request.metadata or {}, |
|
|
"created_at": datetime.utcnow() |
|
|
} |
|
|
result = documents_collection.insert_one(doc_data) |
|
|
doc_id = str(result.inserted_id) |
|
|
|
|
|
|
|
|
embedding = embedding_service.encode_text(request.text) |
|
|
|
|
|
|
|
|
qdrant_service.index_data( |
|
|
doc_id=doc_id, |
|
|
embedding=embedding, |
|
|
metadata={ |
|
|
"text": request.text, |
|
|
"source": "api", |
|
|
**(request.metadata or {}) |
|
|
} |
|
|
) |
|
|
|
|
|
return AddDocumentResponse( |
|
|
success=True, |
|
|
doc_id=doc_id, |
|
|
message=f"Document added successfully with ID: {doc_id}" |
|
|
) |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/rag/search", response_model=List[SearchResponse]) |
|
|
async def rag_search( |
|
|
query: str = Form(...), |
|
|
top_k: int = Form(5), |
|
|
score_threshold: Optional[float] = Form(0.5) |
|
|
): |
|
|
""" |
|
|
Search in knowledge base |
|
|
|
|
|
Body: |
|
|
- query: Search query |
|
|
- top_k: Number of results (default: 5) |
|
|
- score_threshold: Minimum score (default: 0.5) |
|
|
|
|
|
Returns: |
|
|
- results: List of matching documents |
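Python example using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000). Note that this endpoint takes form fields, not JSON:

```python
import requests

resp = requests.post(
    "http://localhost:8000/rag/search",
    data={"query": "how to upload a PDF", "top_k": 5, "score_threshold": 0.5},
)
print(resp.json())
```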
|
|
""" |
|
|
try: |
|
|
|
|
|
query_embedding = embedding_service.encode_text(query) |
|
|
|
|
|
|
|
|
results = qdrant_service.search( |
|
|
query_embedding=query_embedding, |
|
|
limit=top_k, |
|
|
score_threshold=score_threshold |
|
|
) |
|
|
|
|
|
return [ |
|
|
SearchResponse( |
|
|
id=result["id"], |
|
|
confidence=result["confidence"], |
|
|
metadata=result["metadata"] |
|
|
) |
|
|
for result in results |
|
|
] |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.get("/history") |
|
|
async def get_history(limit: int = 10, skip: int = 0): |
|
|
""" |
|
|
Get chat history |
|
|
|
|
|
Query params: |
|
|
- limit: Number of messages to return (default: 10) |
|
|
- skip: Number of messages to skip (default: 0) |
|
|
|
|
|
Returns: |
|
|
- history: List of chat messages |
|
|
""" |
|
|
try: |
|
|
history = list( |
|
|
chat_history_collection |
|
|
.find({}, {"_id": 0}) |
|
|
.sort("timestamp", -1) |
|
|
.skip(skip) |
|
|
.limit(limit) |
|
|
) |
|
|
|
|
|
|
|
|
for msg in history: |
|
|
if "timestamp" in msg: |
|
|
msg["timestamp"] = msg["timestamp"].isoformat() |
|
|
|
|
|
return { |
|
|
"history": history, |
|
|
"total": chat_history_collection.count_documents({}) |
|
|
} |
|
|
|
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.delete("/documents/{doc_id}") |
|
|
async def delete_document_from_kb(doc_id: str): |
|
|
""" |
|
|
Delete document from knowledge base |
|
|
|
|
|
Args: |
|
|
- doc_id: Document ID (MongoDB ObjectId) |
|
|
|
|
|
Returns: |
|
|
- success: True/False |
|
|
- message: Status message |
|
|
""" |
|
|
try: |
|
|
|
|
|
result = documents_collection.delete_one({"_id": ObjectId(doc_id)})
|
|
|
|
|
|
|
|
if result.deleted_count > 0: |
|
|
qdrant_service.delete_by_id(doc_id) |
|
|
return {"success": True, "message": f"Document {doc_id} deleted from knowledge base"} |
|
|
else: |
|
|
raise HTTPException(status_code=404, detail=f"Document {doc_id} not found") |
|
|
|
|
|
except HTTPException: |
|
|
raise |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/upload-pdf", response_model=UploadPDFResponse) |
|
|
async def upload_pdf( |
|
|
file: UploadFile = File(...), |
|
|
document_id: Optional[str] = Form(None), |
|
|
title: Optional[str] = Form(None), |
|
|
description: Optional[str] = Form(None), |
|
|
category: Optional[str] = Form(None) |
|
|
): |
|
|
""" |
|
|
Upload and index PDF file into knowledge base |
|
|
|
|
|
Body (multipart/form-data): |
|
|
- file: PDF file (required) |
|
|
- document_id: Custom document ID (optional, auto-generated if not provided) |
|
|
- title: Document title (optional) |
|
|
- description: Document description (optional) |
|
|
- category: Document category (optional, e.g., "user_guide", "faq") |
|
|
|
|
|
Returns: |
|
|
- success: True/False |
|
|
- document_id: Document ID |
|
|
- filename: Original filename |
|
|
- chunks_indexed: Number of chunks created |
|
|
- message: Status message |
|
|
|
|
|
Example: |
|
|
```bash |
|
|
curl -X POST "http://localhost:8000/upload-pdf" \ |
|
|
-F "file=@user_guide.pdf" \ |
|
|
-F "title=Hướng dẫn sử dụng ChatbotRAG" \ |
|
|
-F "category=user_guide" |
|
|
``` |
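The same upload from Python using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000):

```python
import requests

with open("user_guide.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload-pdf",
        files={"file": ("user_guide.pdf", f, "application/pdf")},
        data={"title": "ChatbotRAG User Guide", "category": "user_guide"},
    )
print(resp.json())
```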
|
|
""" |
|
|
try: |
|
|
|
|
|
if not file.filename.endswith('.pdf'): |
|
|
raise HTTPException(status_code=400, detail="Only PDF files are allowed") |
|
|
|
|
|
|
|
|
if not document_id: |
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
document_id = f"pdf_{timestamp}" |
|
|
|
|
|
|
|
|
pdf_bytes = await file.read() |
|
|
|
|
|
|
|
|
metadata = {} |
|
|
if title: |
|
|
metadata['title'] = title |
|
|
if description: |
|
|
metadata['description'] = description |
|
|
if category: |
|
|
metadata['category'] = category |
|
|
|
|
|
|
|
|
result = pdf_indexer.index_pdf_bytes( |
|
|
pdf_bytes=pdf_bytes, |
|
|
document_id=document_id, |
|
|
filename=file.filename, |
|
|
document_metadata=metadata |
|
|
) |
|
|
|
|
|
return UploadPDFResponse( |
|
|
success=True, |
|
|
document_id=result['document_id'], |
|
|
filename=result['filename'], |
|
|
chunks_indexed=result['chunks_indexed'], |
|
|
message=f"PDF '{file.filename}' đã được index thành công với {result['chunks_indexed']} chunks" |
|
|
) |
|
|
|
|
|
except HTTPException: |
|
|
raise |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error uploading PDF: {str(e)}") |
|
|
|
|
|
|
|
|
@app.get("/documents/pdf") |
|
|
async def list_pdf_documents(): |
|
|
""" |
|
|
List all PDF documents in knowledge base |
|
|
|
|
|
Returns: |
|
|
- documents: List of PDF documents with metadata |
|
|
""" |
|
|
try: |
|
|
docs = list(documents_collection.find( |
|
|
{"type": "pdf"}, |
|
|
{"_id": 0} |
|
|
)) |
|
|
return {"documents": docs, "total": len(docs)} |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.delete("/documents/pdf/{document_id}") |
|
|
async def delete_pdf_document(document_id: str): |
|
|
""" |
|
|
Delete PDF document and all its chunks from knowledge base |
|
|
|
|
|
Args: |
|
|
- document_id: Document ID |
|
|
|
|
|
Returns: |
|
|
- success: True/False |
|
|
- message: Status message |
|
|
""" |
|
|
try: |
|
|
|
|
|
doc = documents_collection.find_one({"document_id": document_id, "type": "pdf"}) |
|
|
|
|
|
if not doc: |
|
|
raise HTTPException(status_code=404, detail=f"PDF document {document_id} not found") |
|
|
|
|
|
|
|
|
chunk_ids = doc.get('chunk_ids', []) |
|
|
for chunk_id in chunk_ids: |
|
|
try: |
|
|
qdrant_service.delete_by_id(chunk_id) |
|
|
except Exception:
|
|
pass |
|
|
|
|
|
|
|
|
documents_collection.delete_one({"document_id": document_id}) |
|
|
|
|
|
return { |
|
|
"success": True, |
|
|
"message": f"PDF document {document_id} and {len(chunk_ids)} chunks deleted" |
|
|
} |
|
|
|
|
|
except HTTPException: |
|
|
raise |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error: {str(e)}") |
|
|
|
|
|
|
|
|
@app.post("/upload-pdf-multimodal", response_model=UploadPDFResponse) |
|
|
async def upload_pdf_multimodal( |
|
|
file: UploadFile = File(...), |
|
|
document_id: Optional[str] = Form(None), |
|
|
title: Optional[str] = Form(None), |
|
|
description: Optional[str] = Form(None), |
|
|
category: Optional[str] = Form(None) |
|
|
): |
|
|
""" |
|
|
Upload PDF with text and image URLs (for user guides with screenshots) |
|
|
|
|
|
This endpoint is optimized for PDFs containing: |
|
|
- Text instructions |
|
|
- Image URLs (http://... or https://...) |
|
|
- Markdown images: ![alt](url)
|
|
- HTML images: <img src="url"> |
|
|
|
|
|
The system will: |
|
|
1. Extract text from PDF |
|
|
2. Detect all image URLs in the text |
|
|
3. Link images to their corresponding text chunks |
|
|
4. Store image URLs in metadata |
|
|
5. Return images along with text during chat |
|
|
|
|
|
Body (multipart/form-data): |
|
|
- file: PDF file (required) |
|
|
- document_id: Custom document ID (optional, auto-generated if not provided) |
|
|
- title: Document title (optional) |
|
|
- description: Document description (optional) |
|
|
- category: Document category (optional, e.g., "user_guide", "tutorial") |
|
|
|
|
|
Returns: |
|
|
- success: True/False |
|
|
- document_id: Document ID |
|
|
- filename: Original filename |
|
|
- chunks_indexed: Number of chunks created |
|
|
- message: Status message (includes image count) |
|
|
|
|
|
Example: |
|
|
```bash |
|
|
curl -X POST "http://localhost:8000/upload-pdf-multimodal" \ |
|
|
-F "file=@user_guide_with_images.pdf" \ |
|
|
-F "title=Hướng dẫn có ảnh minh họa" \ |
|
|
-F "category=user_guide" |
|
|
``` |
|
|
|
|
|
Example Response: |
|
|
```json |
|
|
{ |
|
|
"success": true, |
|
|
"document_id": "pdf_20251029_150000", |
|
|
"filename": "user_guide_with_images.pdf", |
|
|
"chunks_indexed": 25, |
|
|
"message": "PDF 'user_guide_with_images.pdf' indexed with 25 chunks and 15 images" |
|
|
} |
|
|
``` |
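The same upload from Python using the `requests` package (illustrative sketch; assumes a server at http://localhost:8000):

```python
import requests

with open("user_guide_with_images.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/upload-pdf-multimodal",
        files={"file": ("user_guide_with_images.pdf", f, "application/pdf")},
        data={"title": "Illustrated user guide", "category": "user_guide"},
    )
print(resp.json())
```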
|
|
""" |
|
|
try: |
|
|
|
|
|
if not file.filename.endswith('.pdf'): |
|
|
raise HTTPException(status_code=400, detail="Only PDF files are allowed") |
|
|
|
|
|
|
|
|
if not document_id: |
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
document_id = f"pdf_multimodal_{timestamp}" |
|
|
|
|
|
|
|
|
pdf_bytes = await file.read() |
|
|
|
|
|
|
|
|
metadata = {'type': 'multimodal'} |
|
|
if title: |
|
|
metadata['title'] = title |
|
|
if description: |
|
|
metadata['description'] = description |
|
|
if category: |
|
|
metadata['category'] = category |
|
|
|
|
|
|
|
|
result = multimodal_pdf_indexer.index_pdf_bytes( |
|
|
pdf_bytes=pdf_bytes, |
|
|
document_id=document_id, |
|
|
filename=file.filename, |
|
|
document_metadata=metadata |
|
|
) |
|
|
|
|
|
return UploadPDFResponse( |
|
|
success=True, |
|
|
document_id=result['document_id'], |
|
|
filename=result['filename'], |
|
|
chunks_indexed=result['chunks_indexed'], |
|
|
message=f"PDF '{file.filename}' indexed successfully with {result['chunks_indexed']} chunks and {result.get('images_found', 0)} images" |
|
|
) |
|
|
|
|
|
except HTTPException: |
|
|
raise |
|
|
except Exception as e: |
|
|
raise HTTPException(status_code=500, detail=f"Error uploading multimodal PDF: {str(e)}") |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
import uvicorn |
|
|
uvicorn.run( |
|
|
app, |
|
|
host="0.0.0.0", |
|
|
port=8000, |
|
|
log_level="info" |
|
|
) |
|
|
|