Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| from typing import Any, Dict | |
class DocumentParser:
    """Parser for different document formats in the policy corpus.

    Only plain-text formats are handled. A file is accepted when its
    extension, compared case-insensitively, is in ``SUPPORTED_FORMATS``.
    """

    # Extensions (with leading dot, lower-case) that parse_document accepts.
    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """Parse a document and return its content with metadata.

        Args:
            file_path: Path to the document file.

        Returns:
            Dict with two keys:

            * ``content``: the decoded text of the file. A leading UTF-8
              byte-order mark, if present, is not included.
            * ``metadata``: dict with ``filename`` (final path component),
              ``file_type`` (lower-cased extension without the dot),
              ``file_size`` (size on disk in bytes, so it may differ from
              ``len(content)``) and ``file_path`` (absolute path string).

        Raises:
            ValueError: If the file extension is not in
                ``SUPPORTED_FORMATS``. Checked before existence, so an
                unsupported *and* missing file raises ``ValueError``.
            FileNotFoundError: If the path does not exist.
            UnicodeDecodeError: If the file is not valid UTF-8.
            OSError: If the path exists but cannot be opened or read
                (for example, it is a directory).
        """
        path = Path(file_path)

        # Check file format first (before existence check)
        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM,
        # which would otherwise leak into the content as "\ufeff".
        with open(file_path, "r", encoding="utf-8-sig") as f:
            content = f.read()

        metadata = {
            "filename": path.name,
            "file_type": path.suffix.lstrip(".").lower(),
            "file_size": os.path.getsize(file_path),
            "file_path": str(path.absolute()),
        }
        return {"content": content, "metadata": metadata}