Spaces:
Sleeping
Sleeping
File size: 2,338 Bytes
e9ea7c0 4133eb6 e9ea7c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
import os
from typing import List
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
class RAGEngine:
    """Retrieval helper that keeps a FAISS vector store persisted on disk.

    Files are loaded, split into overlapping chunks, embedded with the
    ``all-MiniLM-L6-v2`` HuggingFace model and added to a FAISS index that
    is saved under ``index_path`` after every ingestion.
    """

    def __init__(self, index_path="faiss_index"):
        """Create the embedding model and load a saved index if one exists.

        Args:
            index_path: Location where the FAISS index is saved to and
                loaded from.
        """
        self.index_path = index_path
        self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        # Stays None until an index is loaded from disk or the first file
        # is ingested; ``search`` treats None as "nothing indexed yet".
        self.vector_store = None
        self._load_index()

    def _load_index(self):
        """Load the index saved at ``self.index_path``, if there is one.

        Loading is best-effort: any failure is reported on stdout and the
        engine simply starts without an index.
        """
        if not os.path.exists(self.index_path):
            print("No existing FAISS index found.")
            return

        try:
            # SECURITY NOTE(review): allow_dangerous_deserialization=True makes
            # the loader unpickle data found at index_path. Only point
            # index_path at locations this application wrote itself.
            self.vector_store = FAISS.load_local(
                self.index_path,
                self.embeddings,
                allow_dangerous_deserialization=True,
            )
            print("Loaded existing FAISS index.")
        except Exception as e:
            print(f"Failed to load index: {e}")
            self.vector_store = None

    @staticmethod
    def _is_pdf(file_path: str) -> bool:
        """Return True when ``file_path`` has a ``.pdf`` extension (any case)."""
        return file_path.lower().endswith(".pdf")

    def ingest_file(self, file_path: str):
        """Load a file, split it into chunks and add them to the index.

        Files with a ``.pdf`` extension (case-insensitive) are read with
        ``PyPDFLoader``; every other file is read with ``TextLoader``. The
        index is saved to ``self.index_path`` after the chunks are added.

        Args:
            file_path: Path of the document to ingest.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If no text chunks could be extracted from the file.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        # Load document
        if self._is_pdf(file_path):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)
        documents = loader.load()

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        texts = text_splitter.split_documents(documents)

        # An empty chunk list (e.g. an empty file) cannot be embedded into an
        # index; fail with a clear message instead of an obscure error from
        # inside the vector store, and leave the existing index untouched.
        if not texts:
            raise ValueError(f"No text content could be extracted from: {file_path}")

        # Create or update vector store
        if self.vector_store is None:
            self.vector_store = FAISS.from_documents(texts, self.embeddings)
        else:
            self.vector_store.add_documents(texts)

        # Save index
        self.vector_store.save_local(self.index_path)
        print(f"Ingested {file_path} and updated index.")

    def search(self, query: str, k: int = 3) -> List[str]:
        """Return the text of the chunks most similar to ``query``.

        Args:
            query: Text to search for.
            k: Number of results requested from the vector store.

        Returns:
            The ``page_content`` of each matching document, in the order the
            vector store returns them; an empty list when nothing has been
            indexed yet.
        """
        if self.vector_store is None:
            return []
        docs = self.vector_store.similarity_search(query, k=k)
        return [doc.page_content for doc in docs]

    def clear_index(self):
        """Delete the saved index directory and drop the in-memory store."""
        if os.path.exists(self.index_path):
            import shutil

            shutil.rmtree(self.index_path)
        self.vector_store = None
        print("Index cleared.")
|