Spaces:
Sleeping
Sleeping
File size: 2,902 Bytes
ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 159faf0 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 ffa0f3d 7793bb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
import os
import tempfile
from pathlib import Path
import pytest
def test_parse_txt_file():
    """Check that a plain-text file parses into content plus metadata.

    A two-line document is written to a temporary ``.txt`` file and handed
    to ``DocumentParser.parse_document``. The result must echo the text
    under ``"content"`` and report the file's name and the ``"txt"`` type
    under ``"metadata"``. The temporary file is removed even when an
    assertion fails.
    """
    # NOTE(review): the parser is imported inside the test rather than at
    # module level — presumably so a missing parser module fails only this
    # test instead of breaking collection of the whole file; confirm.
    from src.ingestion.document_parser import DocumentParser

    document_parser = DocumentParser()
    expected_text = "This is a test policy document.\nIt has multiple lines."

    # delete=False keeps the file on disk after it is closed so the parser
    # can reopen it by path; cleanup happens in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False)
    with temp_file:
        temp_file.write(expected_text)
    txt_path = temp_file.name

    try:
        parsed = document_parser.parse_document(txt_path)
        assert parsed["content"] == expected_text
        metadata = parsed["metadata"]
        assert metadata["filename"] == Path(txt_path).name
        assert metadata["file_type"] == "txt"
    finally:
        os.unlink(txt_path)
def test_parse_markdown_file():
    """Check that a markdown file is parsed and tagged with type ``"md"``.

    A small markdown document with nested headings is written to a
    temporary ``.md`` file. The parsed content must still contain the
    title and first section heading text, and the metadata must report the
    ``"md"`` file type. The temporary file is removed even when an
    assertion fails.
    """
    from src.ingestion.document_parser import DocumentParser

    document_parser = DocumentParser()
    markdown_source = (
        "# Policy Title\n"
        "## Section 1\n"
        "This is section content.\n"
        "### Subsection\n"
        "More content here."
    )

    # delete=False keeps the file on disk after it is closed so the parser
    # can reopen it by path; cleanup happens in the ``finally`` below.
    temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False)
    with temp_file:
        temp_file.write(markdown_source)
    md_path = temp_file.name

    try:
        parsed = document_parser.parse_document(md_path)
        body = parsed["content"]
        assert "Policy Title" in body
        assert "Section 1" in body
        assert parsed["metadata"]["file_type"] == "md"
    finally:
        os.unlink(md_path)
def test_parse_unsupported_format():
    """Check that an unsupported extension is rejected with ``ValueError``.

    The raised error's message must match ``"Unsupported file format"``.
    """
    from src.ingestion.document_parser import DocumentParser

    document_parser = DocumentParser()
    # NOTE(review): this path is never created by the test, so the check
    # presumably relies on the parser validating the extension before it
    # looks for the file on disk — confirm against the parser.
    unsupported_path = "test.xyz"

    expected_error = pytest.raises(ValueError, match="Unsupported file format")
    with expected_error:
        document_parser.parse_document(unsupported_path)
def test_parse_nonexistent_file():
    """Check that parsing a missing file raises ``FileNotFoundError``."""
    from src.ingestion.document_parser import DocumentParser

    document_parser = DocumentParser()
    # Relative path with a supported-looking extension; the test assumes no
    # such file exists in the directory the tests are run from.
    missing_path = "nonexistent.txt"

    expected_error = pytest.raises(FileNotFoundError)
    with expected_error:
        document_parser.parse_document(missing_path)
def test_parse_real_policy_document():
    """Check parsing of an actual policy document from the corpus.

    Parses ``synthetic_policies/employee_handbook.md`` and verifies the
    reported filename and file type, the presence of known marker strings
    in the content, a minimum content length, and that the metadata carries
    ``file_size`` and ``file_path`` entries with a positive size.
    """
    from src.ingestion.document_parser import DocumentParser

    document_parser = DocumentParser()

    # The path is relative, so it resolves against the working directory
    # the test run was started from.
    handbook_path = "synthetic_policies/employee_handbook.md"
    parsed = document_parser.parse_document(handbook_path)

    # Identity of the parsed file.
    metadata = parsed["metadata"]
    assert "employee_handbook.md" in metadata["filename"]
    assert metadata["file_type"] == "md"

    # Content: known marker strings plus a sanity floor on length.
    body = parsed["content"]
    for marker in ("Employee Handbook", "HR-POL-001"):
        assert marker in body
    assert len(body) > 100  # Should have substantial content

    # Metadata completeness.
    for required_key in ("file_size", "file_path"):
        assert required_key in metadata
    assert metadata["file_size"] > 0
|