File size: 1,315 Bytes
ffa0f3d
 
7793bb6
 
ffa0f3d
 
 
7793bb6
 
 
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
 
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
 
ffa0f3d
7793bb6
ffa0f3d
7793bb6
 
 
 
ffa0f3d
7793bb6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
from pathlib import Path
from typing import Any, Dict


class DocumentParser:
    """Parser for different document formats in the policy corpus.

    Only files whose extension, compared case-insensitively, appears in
    ``SUPPORTED_FORMATS`` are accepted.
    """

    # Accepted file extensions: lower-case, including the leading dot.
    SUPPORTED_FORMATS = {".txt", ".md", ".markdown"}

    def parse_document(self, file_path: str) -> Dict[str, Any]:
        """
        Parse a document and return content with metadata

        The file is decoded as UTF-8. A leading byte-order mark, if present,
        is dropped so it does not end up as a stray ``"\\ufeff"`` at the
        start of the returned text.

        Args:
            file_path: Path to the document file

        Returns:
            Dict containing 'content' (the decoded text) and 'metadata',
            a dict with the keys:
                'filename':  final path component, e.g. ``"policy.md"``
                'file_type': lower-cased extension without the leading dot
                'file_size': size of the file on disk, in bytes
                'file_path': absolute form of ``file_path`` as a string

        Raises:
            ValueError: If file format is unsupported
            FileNotFoundError: If file doesn't exist
        """
        path = Path(file_path)

        # Check file format first (before existence check), so an unsupported
        # extension is reported as ValueError even when the file is missing.
        if path.suffix.lower() not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported file format: {path.suffix}")

        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # "utf-8-sig" decodes exactly like "utf-8" but skips a leading BOM.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            content = f.read()
            # Take the size from the open handle rather than looking the path
            # up again, so it describes the same file that was just read.
            file_size = os.fstat(f.fileno()).st_size

        metadata = {
            "filename": path.name,
            "file_type": path.suffix.lstrip(".").lower(),
            "file_size": file_size,
            "file_path": str(path.absolute()),
        }

        return {"content": content, "metadata": metadata}