import os
import hashlib
import sqlite3
from datetime import datetime, timezone
from typing import List, Optional, Tuple
from pathlib import Path
import json
import numpy as np
from flask import Flask, request, jsonify, send_from_directory, render_template, url_for
from werkzeug.utils import secure_filename
# --- LLM / Transcription config ---
# Read from env (don’t hardcode secrets)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "").strip()
# OpenAI client (lazy import only if key set)
client = None
if OPENAI_API_KEY:
try:
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)
except Exception as e:
print("Warning: OpenAI client init failed:", e)
client = None
# Try offline transcription via faster-whisper if available
FAST_WHISPER = None
try:
from faster_whisper import WhisperModel # pip install faster-whisper
# Force CPU to avoid CUDA/cuDNN issues on Windows
FAST_WHISPER = WhisperModel(
"base", # or "tiny" for faster CPU
device="cpu",
compute_type="int8" # good speed/quality trade-off on CPU
)
except Exception as e:
print("faster-whisper unavailable:", e)
FAST_WHISPER = None
# --- multimodal deps ---
import fitz # PyMuPDF
import docx
import mammoth
from PIL import Image
import pytesseract  # wraps the Tesseract OCR binary, which must be installed on the system
from sentence_transformers import SentenceTransformer
import faiss
# Optional: get audio duration
AUDIO_DURATION_ENABLED = True
try:
from pydub import AudioSegment # pip install pydub; requires ffmpeg on system
except Exception:
AUDIO_DURATION_ENABLED = False
# ---------------- Config ----------------
# Code location (for templates/static)
BASE_DIR = Path(__file__).parent.resolve()
# Writable persistent storage on HF Spaces
DATA_ROOT = Path(os.getenv("DATA_DIR", "/data")).resolve()
os.makedirs(DATA_ROOT, exist_ok=True)
DB_PATH = str(DATA_ROOT / "rag_local.db")
MEDIA_DIR = str(DATA_ROOT / "ingested_media")
INDEX_DIR = str(DATA_ROOT / "index_store")
os.makedirs(MEDIA_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)
DOC_EXT = {'.pdf', '.docx', '.doc', '.txt'}
IMG_EXT = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.webp'}
AUDIO_EXT = {'.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac', '.webm'}
ALLOWED_EXT = DOC_EXT | IMG_EXT | AUDIO_EXT
# ---------------- DB setup ----------------
# NOTE: a single shared connection/cursor with check_same_thread=False is not
# thread-safe under concurrent requests; acceptable for a single-worker demo.
conn = sqlite3.connect(DB_PATH, check_same_thread=False)
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file_name TEXT,
file_type TEXT,
file_hash TEXT UNIQUE,
created_at TEXT,
orig_path TEXT
)''')
c.execute('''CREATE TABLE IF NOT EXISTS text_chunks (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER,
chunk_text TEXT,
chunk_meta TEXT,
FOREIGN KEY(document_id) REFERENCES documents(id)
)''')
c.execute('''CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER,
image_path TEXT,
page_num INTEGER,
FOREIGN KEY(document_id) REFERENCES documents(id)
)''')
# NEW: audio metadata table
c.execute('''CREATE TABLE IF NOT EXISTS audios (
id INTEGER PRIMARY KEY AUTOINCREMENT,
document_id INTEGER,
audio_path TEXT,
duration_sec REAL,
sample_rate INTEGER,
FOREIGN KEY(document_id) REFERENCES documents(id)
)''')
conn.commit()
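# NOTE: SQLite does not enforce the FOREIGN KEY clauses above unless the
# per-connection pragma is enabled; if enforcement is wanted, something like:
#   conn.execute("PRAGMA foreign_keys = ON")
# (left off here so existing ingest behavior is unchanged)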
# ---------------- Utilities ----------------
def file_hash_bytes(b: bytes) -> str:
return hashlib.sha256(b).hexdigest()
def save_uploaded_file(file_storage, dest_dir=MEDIA_DIR) -> str:
filename = secure_filename(file_storage.filename)
dest = os.path.join(dest_dir, filename)
count = 1
base, ext = os.path.splitext(filename)
while os.path.exists(dest):
filename = f"{base}_{count}{ext}"
dest = os.path.join(dest_dir, filename)
count += 1
file_storage.save(dest)
return dest
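# Example (illustrative): uploading "report.pdf" twice saves the second copy
# as "report_1.pdf", a third as "report_2.pdf", and so on.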
# ---------------- Extraction: docs/images ----------------
def process_pdf(file_path: str) -> Tuple[str, List[Tuple[str,int]]]:
doc = fitz.open(file_path)
text_pages, images = [], []
for page_num in range(len(doc)):
page = doc[page_num]
text_pages.append(page.get_text())
for img_index, img in enumerate(page.get_images(full=True)):
xref = img[0]
pix = fitz.Pixmap(doc, xref)
img_name = f'{Path(file_path).stem}_page{page_num}_{img_index}.png'
img_path = os.path.join(MEDIA_DIR, img_name)
            if pix.n - pix.alpha < 4:  # GRAY or RGB: safe to save as-is
                pix.save(img_path)
            else:
                # CMYK and friends: convert to RGB before saving
                pix0 = fitz.Pixmap(fitz.csRGB, pix)
                pix0.save(img_path)
pix0 = None
pix = None
images.append((img_path, page_num))
return '\n'.join(text_pages), images
def process_docx(file_path: str) -> Tuple[str, List[Tuple[str,int]]]:
d = docx.Document(file_path)
text = '\n'.join([p.text for p in d.paragraphs])
return text, []
def process_doc(file_path: str) -> Tuple[str, List[Tuple[str,int]]]:
    # mammoth targets .docx; legacy binary .doc files may raise here, and the
    # try/except in ingest_file_disk reports that as an error.
    with open(file_path, 'rb') as f:
        result = mammoth.extract_raw_text(f)
    return result.value, []
def ocr_image(file_path: str) -> Tuple[str, List[Tuple[str,int]]]:
try:
img = Image.open(file_path)
text = pytesseract.image_to_string(img)
return text, [(file_path, 0)]
except Exception as e:
print("OCR error:", e)
return "", [(file_path, 0)]
# ---------------- Extraction: audio ----------------
def _audio_duration(file_path: str) -> Optional[float]:
if not AUDIO_DURATION_ENABLED:
return None
try:
seg = AudioSegment.from_file(file_path)
return round(len(seg) / 1000.0, 3) # seconds
except Exception:
return None
def _transcribe_offline_faster_whisper(file_path: str) -> Tuple[str, Optional[int]]:
"""
Returns (text, sample_rate) using faster-whisper if available.
"""
if FAST_WHISPER is None:
return "", None
try:
segments, info = FAST_WHISPER.transcribe(file_path, beam_size=1)
text = " ".join([s.text.strip() for s in segments if s.text])
        sr = getattr(info, "sample_rate", None)  # not every faster-whisper version exposes this
return text.strip(), sr
except Exception as e:
print("faster-whisper failed:", e)
return "", None
def _transcribe_openai(file_path: str) -> Tuple[str, Optional[int]]:
"""
Returns (text, sample_rate). OpenAI Whisper returns only text; sample_rate unknown.
"""
if client is None:
return "", None
try:
with open(file_path, "rb") as af:
# Whisper-1 transcription
resp = client.audio.transcriptions.create(
model="whisper-1",
file=af
)
text = getattr(resp, "text", None) or (resp.get("text") if isinstance(resp, dict) else "")
return (text or "").strip(), None
except Exception as e:
print("OpenAI Whisper failed:", e)
return "", None
def process_audio(file_path: str) -> Tuple[str, Optional[float], Optional[int]]:
"""
Returns (transcript_text, duration_sec, sample_rate)
Tries faster-whisper (offline) first, then OpenAI Whisper. If both fail, returns empty text.
"""
duration = _audio_duration(file_path)
# Try offline first (if installed)
text, sr = _transcribe_offline_faster_whisper(file_path)
if not text:
# Try OpenAI Whisper if key present
t2, sr2 = _transcribe_openai(file_path)
text = t2
sr = sr2
    return text, duration, sr
# ---------------- Ingestion ----------------
def ingest_file_disk(file_path: str):
ext = Path(file_path).suffix.lower()
with open(file_path, 'rb') as f:
b = f.read()
fh = file_hash_bytes(b)
c.execute('SELECT id FROM documents WHERE file_hash=?', (fh,))
if c.fetchone():
return {"status": "exists", "file_hash": fh}
text, images = "", []
audio_meta = None # (duration_sec, sample_rate)
try:
if ext == '.pdf':
text, images = process_pdf(file_path)
elif ext == '.docx':
text, images = process_docx(file_path)
elif ext == '.doc':
text, images = process_doc(file_path)
elif ext in IMG_EXT:
text, images = ocr_image(file_path)
elif ext == '.txt':
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
text = f.read()
elif ext in AUDIO_EXT:
# process audio: store file + (optional) transcript to text_chunks
t, duration, sr = process_audio(file_path)
text = t or "" # may be empty if transcription unavailable
audio_meta = (duration, sr)
else:
return {"status": "unsupported", "ext": ext}
except Exception as e:
return {"status": "error", "error": str(e)}
now = datetime.now(timezone.utc).isoformat()
c.execute('INSERT INTO documents (file_name, file_type, file_hash, created_at, orig_path) VALUES (?, ?, ?, ?, ?)',
(Path(file_path).name, ext, fh, now, file_path))
doc_id = c.lastrowid
conn.commit()
# Save extracted text (including transcribed audio) if present
if text and text.strip():
meta = {"source": Path(file_path).name}
if ext in AUDIO_EXT:
meta["modality"] = "audio_transcript"
c.execute('INSERT INTO text_chunks (document_id, chunk_text, chunk_meta) VALUES (?, ?, ?)',
(doc_id, text, json.dumps(meta)))
conn.commit()
# Save image references (from PDFs or images)
for img_path, page_num in images:
c.execute('INSERT INTO images (document_id, image_path, page_num) VALUES (?, ?, ?)',
(doc_id, img_path, page_num))
conn.commit()
# Save audio metadata if applicable
if ext in AUDIO_EXT:
dur, sr = (audio_meta or (None, None))
c.execute('INSERT INTO audios (document_id, audio_path, duration_sec, sample_rate) VALUES (?, ?, ?, ?)',
(doc_id, file_path, dur, sr))
conn.commit()
return {"status": "ingested", "doc_id": doc_id, "file_hash": fh, "file_type": ext}
# ---------------- Embedding + FAISS ----------------
EMBED_MODEL = SentenceTransformer('intfloat/e5-small-v2')
INDEX_FILE = os.path.join(INDEX_DIR, 'faiss_e5_small.index')
ID_MAP_FILE = os.path.join(INDEX_DIR, 'id_mapping.jsonl')
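# NOTE (per the e5 model card; verify for your version): e5 models are trained
# with "query: " / "passage: " input prefixes, so retrieval may improve by
# encoding prefixed text, e.g. (sketch):
#   EMBED_MODEL.encode([f"query: {q}"], normalize_embeddings=True)
#   EMBED_MODEL.encode([f"passage: {t}" for t in texts], normalize_embeddings=True)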
def _chunk_text(text: str, max_chars=1000, overlap=200) -> List[str]:
text = text.strip()
if not text: return []
chunks, start, L = [], 0, len(text)
while start < L:
end = min(start + max_chars, L)
chunks.append(text[start:end].strip())
if end == L: break
start = end - overlap
return chunks
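# Worked example of the sliding window above (illustrative helper, not called
# anywhere): each step advances by max_chars - overlap = 800 characters.
def _example_chunking() -> List[str]:
    demo = "x" * 2600
    # slices [0:1000], [800:1800], [1600:2600] -> 3 chunks
    return _chunk_text(demo)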
def build_faiss_index():
c.execute('SELECT id, document_id, chunk_text, chunk_meta FROM text_chunks')
rows = c.fetchall()
pieces = []
for row in rows:
parent_id, doc_id, text, meta = row
meta_dict = json.loads(meta) if meta else {}
for piece in _chunk_text(text):
pieces.append((parent_id, doc_id, piece, meta_dict))
if not pieces: return {"status": "no_text"}
texts = [p[2] for p in pieces]
embeddings = EMBED_MODEL.encode(texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)
faiss.write_index(index, INDEX_FILE)
mapping = []
for i, p in enumerate(pieces):
mapping.append({
"faiss_idx": i,
"parent_chunk_id": p[0],
"document_id": p[1],
"text": p[2],
"meta": p[3]
})
with open(ID_MAP_FILE, 'w', encoding='utf-8') as f:
for m in mapping:
f.write(json.dumps(m, ensure_ascii=False) + '\n')
return {"status": "built", "num_pieces": len(pieces)}
def load_faiss_index():
    if os.path.exists(INDEX_FILE) and os.path.exists(ID_MAP_FILE):
        index = faiss.read_index(INDEX_FILE)
        # Read the mapping with a context manager so the file handle is closed.
        with open(ID_MAP_FILE, 'r', encoding='utf-8') as f:
            id_map = [json.loads(line) for line in f]
        return index, id_map
    return None, None
def semantic_search(query: str, top_k=5):
index, id_map = load_faiss_index()
if index is None: return []
q_emb = EMBED_MODEL.encode([query], convert_to_numpy=True, normalize_embeddings=True)
D, I = index.search(q_emb, top_k)
results = []
for score, idx in zip(D[0], I[0]):
if idx < 0: continue
entry = id_map[idx]
doc_id = entry['document_id']
c.execute('SELECT file_name FROM documents WHERE id=?', (doc_id,))
r = c.fetchone()
fn = r[0] if r else None
results.append({
"faiss_idx": int(idx),
"score": float(score),
"document_id": int(doc_id),
"file_name": fn,
"text": entry['text'],
"meta": entry.get('meta', {})
})
return results
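# Example call (illustrative query and values):
#   semantic_search("refund policy", top_k=3)
# -> [{"faiss_idx": 7, "score": 0.83, "document_id": 2, "file_name": "faq.docx",
#      "text": "…", "meta": {"source": "faq.docx"}}, ...]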
# ---------------- Summarization (grounded) ----------------
# Note: /api/query below returns the top hit's text directly as the summary;
# this helper is available for producing grounded, cited summaries instead.
def generate_grounded_summary(query: str, hits: List[dict], max_tokens=300) -> str:
if OPENAI_API_KEY and client:
prompt_snippets = []
for i, h in enumerate(hits, start=1):
src = h.get('file_name') or f"doc_{h['document_id']}"
snippet = h.get('text', '').strip().replace('\n', ' ')
if len(snippet) > 400: snippet = snippet[:400].rsplit(' ', 1)[0] + "…"
prompt_snippets.append(f"[{i}] Source: {src}\nText: {snippet}")
system = (
"You are a helpful assistant that answers user queries using only the provided source snippets. "
"Always include numbered citations like [1], [2]. If info not found, say so clearly."
)
user = f"User query: {query}\n\nSources:\n" + "\n\n".join(prompt_snippets)
try:
resp = client.chat.completions.create(
model="gpt-4o-mini",
messages=[
{"role": "system", "content": system},
{"role": "user", "content": user}
],
max_tokens=max_tokens,
temperature=0.0
)
return resp.choices[0].message.content.strip()
except Exception as e:
print("OpenAI summarization failed:", e)
# fallback extractive
lines = []
for i, h in enumerate(hits, start=1):
snippet = h.get('text', '').strip()
if len(snippet) > 300: snippet = snippet[:300].rsplit(' ', 1)[0] + "…"
lines.append(f"[{i}] {snippet}")
return "Extractive snippets (top results):\n" + "\n\n".join(lines)
# ---------------- Flask App ----------------
app = Flask(__name__, static_folder=str(BASE_DIR / "static"), template_folder=str(BASE_DIR / "templates"))
app.config['MAX_CONTENT_LENGTH'] = 200 * 1024 * 1024
@app.route('/')
def home():
return render_template('index.html')
@app.route('/media/<path:filename>')
def media(filename):
return send_from_directory(MEDIA_DIR, filename)
@app.route('/api/upload', methods=['POST'])
def api_upload():
if 'file' not in request.files:
return jsonify({"error": "no file part"}), 400
f = request.files['file']
if f.filename == '':
return jsonify({"error": "no selected file"}), 400
ext = Path(f.filename).suffix.lower()
if ext not in ALLOWED_EXT:
return jsonify({"error": "unsupported file type", "ext": ext}), 400
saved = save_uploaded_file(f, dest_dir=MEDIA_DIR)
result = ingest_file_disk(saved)
return jsonify(result)
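# Example request (illustrative file; port is the $PORT default from __main__):
#   curl -F "file=@notes.pdf" http://localhost:7860/api/upload
# -> {"status": "ingested", "doc_id": 1, "file_hash": "…", "file_type": ".pdf"}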
@app.route('/api/build_index', methods=['POST'])
def api_build_index():
res = build_faiss_index()
return jsonify(res)
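# Example request (illustrative count):
#   curl -X POST http://localhost:7860/api/build_index
# -> {"status": "built", "num_pieces": 42}   (or {"status": "no_text"})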
# Serialization helper for numpy values, usable as json.dumps(..., default=json_safe).
def json_safe(o):
if isinstance(o, np.integer): return int(o)
if isinstance(o, np.floating): return float(o)
if isinstance(o, np.ndarray): return o.tolist()
raise TypeError(f"Object {type(o)} not serializable")
@app.route('/api/query', methods=['POST'])
def api_query():
    data = request.get_json(silent=True) or {}  # tolerate missing/non-JSON bodies
q = (data.get('query') or "").strip()
if not q:
return jsonify({"error": "must provide 'query'"}), 400
# fetch exactly one result
hits = semantic_search(q, top_k=1)
if not hits:
# keep legacy shape so UI doesn't break
return jsonify({"hits": [], "summary": "", "message": "no match"}), 200
h = hits[0]
doc_id = h['document_id']
# ensure we always have a file_url
c.execute('SELECT file_name FROM documents WHERE id=?', (doc_id,))
r = c.fetchone()
file_name = (r[0] if r else h.get('file_name')) or ""
file_url = url_for('media', filename=file_name, _external=False) if file_name else None
# return minimal, but compatible with UI
minimal_hit = {
"document_id": doc_id,
"file_name": file_name,
"file_url": file_url,
"text": h.get("text", "")
# (no score, no faiss_idx, no meta)
}
# if your UI also reads `summary`, reuse text there
return jsonify({
"hits": [minimal_hit],
"summary": minimal_hit["text"]
})
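# Example request/response (illustrative values):
#   curl -X POST http://localhost:7860/api/query \
#        -H "Content-Type: application/json" -d '{"query": "project deadline"}'
# -> {"hits": [{"document_id": 1, "file_name": "notes.pdf",
#               "file_url": "/media/notes.pdf", "text": "…"}],
#     "summary": "…"}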
@app.route('/api/list_docs', methods=['GET'])
def api_list_docs():
c.execute('SELECT id, file_name, file_type, created_at FROM documents ORDER BY id DESC')
rows = c.fetchall()
out = [{"id": r[0], "file_name": r[1], "file_type": r[2], "created_at": r[3],
"url": url_for('media', filename=r[1], _external=False)} for r in rows]
return jsonify(out)
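# Example response (illustrative):
#   curl http://localhost:7860/api/list_docs
# -> [{"id": 1, "file_name": "notes.pdf", "file_type": ".pdf",
#      "created_at": "2025-01-01T00:00:00+00:00", "url": "/media/notes.pdf"}]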
if __name__ == '__main__':
    # HF Spaces provides the port via $PORT; default to 7860 locally.
    port = int(os.environ.get("PORT", "7860"))
    print(f"🚀 Starting Flask app — open http://0.0.0.0:{port}")
    app.run(host='0.0.0.0', port=port, debug=False)