Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

File size: 1,315 Bytes

ffa0f3d
 
7793bb6
 
ffa0f3d
 
 
7793bb6
 
 
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
 
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
 
ffa0f3d
7793bb6
ffa0f3d
7793bb6
 
 
 
ffa0f3d
7793bb6

import os
from pathlib import Path
from typing import Any, Dict


class DocumentParser:
    """Load plain-text policy documents together with basic file metadata"""

    # Extensions (leading dot included, lower case) that parse_document accepts.
    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """
        Read a supported text document and bundle its text with file metadata

        Args:
            file_path: Location of the document on disk

        Returns:
            Dict with two keys: 'content' (the file decoded as UTF-8) and
            'metadata' (a dict holding 'filename', 'file_type', 'file_size'
            and 'file_path')

        Raises:
            ValueError: If the extension is not in SUPPORTED_FORMATS
            FileNotFoundError: If nothing exists at file_path
        """
        path = Path(file_path)
        extension = path.suffix
        normalized_extension = extension.lower()

        # The extension is validated before the filesystem is consulted, so an
        # unsupported name is rejected even when the file is missing.
        if normalized_extension not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported file format: {extension}")

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        text = path.read_text(encoding="utf-8")

        return {
            "content": text,
            "metadata": {
                "filename": path.name,
                # Extension without its leading dot, lower-cased (e.g. "md").
                "file_type": normalized_extension.lstrip("."),
                # Size on disk in bytes, queried after the read has finished.
                "file_size": os.path.getsize(file_path),
                "file_path": str(path.absolute()),
            },
        }