File size: 2,902 Bytes
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
 
 
ffa0f3d
 
 
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
159faf0
7793bb6
 
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
7793bb6
ffa0f3d
 
 
 
 
7793bb6
 
ffa0f3d
 
7793bb6
ffa0f3d
 
7793bb6
 
 
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
 
 
7793bb6
ffa0f3d
7793bb6
ffa0f3d
7793bb6
 
 
 
 
 
ffa0f3d
7793bb6
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
import tempfile
from pathlib import Path

import pytest


def test_parse_txt_file():
    """Round-trip a small plain-text file through ``DocumentParser``.

    A two-line document is written to a temporary ``.txt`` file, parsed, and
    the returned ``content`` plus the ``filename`` and ``file_type`` metadata
    entries are checked. The temporary file is always removed afterwards.
    """
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    expected_text = "This is a test policy document.\nIt has multiple lines."

    # delete=False keeps the file on disk once the handle is closed so it can
    # be handed to the parser by path; the finally block below cleans it up.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.write(expected_text)
    txt_file = Path(handle.name)

    try:
        parsed = doc_parser.parse_document(handle.name)
        metadata = parsed["metadata"]
        assert parsed["content"] == expected_text
        assert metadata["filename"] == txt_file.name
        assert metadata["file_type"] == "txt"
    finally:
        txt_file.unlink()


def test_parse_markdown_file():
    """Parse a temporary ``.md`` file and check headings and file type.

    The fixture has a title, one section and one subsection. The test asserts
    that the title and section heading text appear in the parsed ``content``
    and that the ``file_type`` metadata entry is ``"md"``.
    """
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    md_source = "\n".join(
        [
            "# Policy Title",
            "",
            "## Section 1",
            "This is section content.",
            "",
            "### Subsection",
            "More content here.",
        ]
    )

    # delete=False so the file survives the handle being closed; it is
    # removed explicitly in the finally block.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write(md_source)
    md_file = Path(handle.name)

    try:
        parsed = doc_parser.parse_document(handle.name)
        for heading in ("Policy Title", "Section 1"):
            assert heading in parsed["content"]
        assert parsed["metadata"]["file_type"] == "md"
    finally:
        md_file.unlink()


def test_parse_unsupported_format():
    """Expect ``ValueError`` mentioning the unsupported format for ``.xyz``.

    Note that ``test.xyz`` is not created on disk; the test only passes a
    path with an extension the parser is expected to reject.
    """
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    with pytest.raises(ValueError) as excinfo:
        doc_parser.parse_document("test.xyz")

    # Same regex search that the ``match=`` keyword of pytest.raises performs.
    excinfo.match("Unsupported file format")


def test_parse_nonexistent_file():
    """Expect ``FileNotFoundError`` when the given ``.txt`` path is absent."""
    from src.ingestion.document_parser import DocumentParser

    missing_path = "nonexistent.txt"
    doc_parser = DocumentParser()

    with pytest.raises(FileNotFoundError):
        doc_parser.parse_document(missing_path)


def test_parse_real_policy_document():
    """Parse the employee handbook from the policy corpus and sanity-check it.

    The path is given relative to the current working directory, so the test
    relies on ``synthetic_policies/employee_handbook.md`` being reachable from
    wherever pytest is invoked.
    """
    from src.ingestion.document_parser import DocumentParser

    parsed = DocumentParser().parse_document("synthetic_policies/employee_handbook.md")
    metadata = parsed["metadata"]
    content = parsed["content"]

    # Identity of the parsed document.
    assert "employee_handbook.md" in metadata["filename"]
    assert metadata["file_type"] == "md"

    # Expected markers in the body, and a minimum amount of text.
    for marker in ("Employee Handbook", "HR-POL-001"):
        assert marker in content
    assert len(content) > 100

    # Metadata entries that must be present, and a non-empty file size.
    for key in ("file_size", "file_path"):
        assert key in metadata
    assert metadata["file_size"] > 0