Spaces:
Sleeping
Sleeping
File size: 1,315 Bytes
ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import os
from pathlib import Path
from typing import Any, Dict
class DocumentParser:
    """Reader for the plain-text document formats in the policy corpus.

    Only files whose suffix (compared case-insensitively) is listed in
    ``SUPPORTED_FORMATS`` are accepted.
    """

    # Suffixes are stored lowercase and include the leading dot.
    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """Read a document as UTF-8 text and return it with file metadata.

        Args:
            file_path: Path to the document file.

        Returns:
            A dict with two keys:

            * ``"content"``: the full decoded text of the file.
            * ``"metadata"``: a dict with ``"filename"`` (final path
              component), ``"file_type"`` (lowercase suffix without the
              dot), ``"file_size"`` (size on disk in bytes) and
              ``"file_path"`` (absolute path as a string).

        Raises:
            ValueError: If the file suffix is not in ``SUPPORTED_FORMATS``.
                This is checked first, so it is raised even when the path
                does not exist.
            FileNotFoundError: If the path does not exist or is not a
                regular file (for example, a directory).
        """
        path = Path(file_path)
        suffix = path.suffix

        # Validate the format before touching the filesystem so that an
        # unsupported suffix is reported regardless of whether the file exists.
        if suffix.lower() not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported file format: {suffix}")

        # is_file() rather than exists(): a directory that happens to carry a
        # supported suffix must fail here with the documented exception, not
        # later inside open() with a platform-dependent OSError subclass.
        if not path.is_file():
            raise FileNotFoundError(f"File not found: {file_path}")

        content = path.read_text(encoding="utf-8")

        metadata = {
            "filename": path.name,
            "file_type": suffix.lstrip(".").lower(),
            "file_size": path.stat().st_size,
            "file_path": str(path.absolute()),
        }
        return {"content": content, "metadata": metadata}
|