import os
from pathlib import Path
from typing import Any, Dict


class DocumentParser:
    """Parser for different document formats in the policy corpus.

    Only files whose suffix (compared case-insensitively) is listed in
    ``SUPPORTED_FORMATS`` are accepted; they are read as UTF-8 text.
    """

    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """Parse a document and return its content with metadata.

        Args:
            file_path: Path to the document file.

        Returns:
            Dict with two keys:

            * ``"content"``: the full file text, decoded as UTF-8.
            * ``"metadata"``: dict with ``"filename"`` (final path
              component), ``"file_type"`` (lower-cased suffix without the
              dot), ``"file_size"`` (size in bytes) and ``"file_path"``
              (absolute path as a string).

        Raises:
            ValueError: If the file suffix is not in ``SUPPORTED_FORMATS``.
                This is checked first, so it is raised even when the path
                does not exist.
            FileNotFoundError: If the path does not exist or is not a
                regular file (for example, a directory).
            UnicodeDecodeError: If the file is not valid UTF-8 (note that
                this is a subclass of ``ValueError``).
        """
        path = Path(file_path)

        # Check file format first (before existence check) so callers get a
        # format error for unsupported suffixes regardless of the filesystem.
        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        # is_file() rather than exists(): a directory that happens to carry a
        # supported suffix would otherwise fail inside open() with a
        # platform-dependent error instead of the documented one.
        if not path.is_file():
            raise FileNotFoundError(f"File not found: {file_path}")

        with path.open("r", encoding="utf-8") as f:
            content = f.read()
            # Stat the open handle so the reported size belongs to the same
            # file that was just read, even if the path is replaced meanwhile.
            file_size = os.fstat(f.fileno()).st_size

        metadata = {
            "filename": path.name,
            "file_type": path.suffix.lstrip(".").lower(),
            "file_size": file_size,
            "file_path": str(path.absolute()),
        }

        return {"content": content, "metadata": metadata}