# msse-ai-engineering/tests/test_ingestion/test_document_parser.py
# sethmcknight
# Refactor test cases for improved readability and consistency
# 159faf0
import os
import tempfile
from pathlib import Path
import pytest
def test_parse_txt_file():
    """Check that a plain-text file round-trips through the parser with its metadata."""
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    expected_text = "This is a test policy document.\nIt has multiple lines."

    # delete=False leaves the file on disk once the handle is closed, so the
    # parser can open it by path; it is removed explicitly in the finally below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.write(expected_text)
        txt_path = handle.name

    try:
        parsed = doc_parser.parse_document(txt_path)

        # Content must come back exactly as written.
        assert parsed["content"] == expected_text

        # Metadata carries the bare file name and the extension without a dot.
        metadata = parsed["metadata"]
        assert metadata["filename"] == Path(txt_path).name
        assert metadata["file_type"] == "txt"
    finally:
        os.unlink(txt_path)
def test_parse_markdown_file():
    """Check that a markdown file keeps its heading text and is typed as ``md``."""
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()

    # Small document with three heading levels and body text.
    markdown_lines = (
        "# Policy Title",
        "## Section 1",
        "This is section content.",
        "### Subsection",
        "More content here.",
    )

    # delete=False leaves the file on disk once the handle is closed, so the
    # parser can open it by path; it is removed explicitly in the finally below.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write("\n".join(markdown_lines))
        md_path = handle.name

    try:
        parsed = doc_parser.parse_document(md_path)

        content = parsed["content"]
        for heading_text in ("Policy Title", "Section 1"):
            assert heading_text in content

        assert parsed["metadata"]["file_type"] == "md"
    finally:
        os.unlink(md_path)
def test_parse_unsupported_format():
    """Check that a ``.xyz`` path is rejected with an "Unsupported file format" ValueError."""
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()

    # The path does not exist on disk; the test expects the format error anyway.
    expect_format_error = pytest.raises(ValueError, match="Unsupported file format")
    with expect_format_error:
        doc_parser.parse_document("test.xyz")
def test_parse_nonexistent_file():
    """Check that a missing file with a supported extension raises FileNotFoundError."""
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()

    expect_missing_file = pytest.raises(FileNotFoundError)
    with expect_missing_file:
        doc_parser.parse_document("nonexistent.txt")
def test_parse_real_policy_document():
    """Test parsing an actual policy document from our corpus.

    The corpus file is first looked up relative to the current working
    directory, exactly as before. If it is not there (for example because
    pytest was launched from a different directory), the ancestors of this
    test file are searched for the same relative path, so the test no longer
    depends on where it was invoked from.
    """
    from src.ingestion.document_parser import DocumentParser

    parser = DocumentParser()

    # Use a real policy document from our corpus
    policy_path = "synthetic_policies/employee_handbook.md"
    if not Path(policy_path).exists():
        for ancestor in Path(__file__).resolve().parents:
            candidate = ancestor / policy_path
            if candidate.is_file():
                policy_path = str(candidate)
                break
        # When no ancestor holds the file, policy_path is left unchanged so
        # the parser reports the missing document itself.

    result = parser.parse_document(policy_path)
    metadata = result["metadata"]
    content = result["content"]

    # Verify content structure
    assert "employee_handbook.md" in metadata["filename"]
    assert metadata["file_type"] == "md"
    assert "Employee Handbook" in content
    assert "HR-POL-001" in content
    assert len(content) > 100  # Should have substantial content

    # Verify metadata completeness
    assert "file_size" in metadata
    assert "file_path" in metadata
    assert metadata["file_size"] > 0