msse-ai-engineering / src /ingestion /document_parser.py
Tobias Pasquale
style: Fix code formatting and linting issues for CI/CD compliance
7793bb6
import os
from pathlib import Path
from typing import Any, Dict
class DocumentParser:
    """Parser for different document formats in the policy corpus.

    Only plain-text formats are handled. The accepted file extensions are
    listed in ``SUPPORTED_FORMATS`` and are matched case-insensitively.
    """

    # Accepted extensions: lower-case, including the leading dot.
    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """
        Parse a document and return content with metadata

        Args:
            file_path: Path to the document file

        Returns:
            Dict containing 'content' and 'metadata':
                content: Full text of the file, decoded as UTF-8
                metadata: Dict with the keys
                    filename: Final path component, e.g. "policy.md"
                    file_type: Extension, lower-cased, without the dot
                    file_size: Size of the file on disk, in bytes
                    file_path: Absolute path of the file, as a string

        Raises:
            ValueError: If file format is unsupported. This is checked
                before anything touches the filesystem, so it is raised
                even when the path does not exist.
            FileNotFoundError: If the path doesn't exist or is not a
                regular file (for example, a directory)
            UnicodeDecodeError: If the file is not valid UTF-8. Note that
                this is itself a subclass of ValueError.
        """
        path = Path(file_path)

        # Check file format first (before existence check)
        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        # is_file() rather than exists(): a directory whose name ends in a
        # supported suffix would pass exists() and then make open() fail with
        # a platform-dependent error (IsADirectoryError on POSIX,
        # PermissionError on Windows) that is not part of this method's
        # documented contract.
        if not path.is_file():
            raise FileNotFoundError(f"File not found: {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()

        metadata = {
            "filename": path.name,
            "file_type": path.suffix.lstrip(".").lower(),
            # Byte count on disk; can differ from len(content) for non-ASCII
            # text or translated line endings.
            "file_size": os.path.getsize(file_path),
            "file_path": str(path.absolute()),
        }

        return {"content": content, "metadata": metadata}