Spaces:
Sleeping
Sleeping
| import os | |
| import tempfile | |
| from pathlib import Path | |
| import pytest | |
def test_parse_txt_file():
    """Parse a temporary ``.txt`` file and check its content and metadata.

    A two-line text file is written to a named temporary file and handed to
    ``DocumentParser.parse_document``. The result must carry the exact text
    under ``"content"`` and, under ``"metadata"``, the file's base name and
    the file type ``"txt"``. The temporary file is removed afterwards whether
    or not the assertions pass.
    """
    # Function-scope import: an ImportError surfaces as a failure of this
    # test rather than as a collection error for the whole module.
    from src.ingestion.document_parser import DocumentParser

    expected_text = "This is a test policy document.\nIt has multiple lines."
    doc_parser = DocumentParser()

    # delete=False keeps the file on disk once the handle is closed, so it
    # can be passed to the parser by path; the finally clause removes it.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
        handle.write(expected_text)
        txt_path = handle.name

    try:
        parsed = doc_parser.parse_document(txt_path)
        assert parsed["content"] == expected_text

        meta = parsed["metadata"]
        assert meta["filename"] == Path(txt_path).name
        assert meta["file_type"] == "txt"
    finally:
        os.unlink(txt_path)
def test_parse_markdown_file():
    """Parse a temporary ``.md`` file and check headings and file type.

    A small markdown document with a title, a section and a subsection is
    written to a named temporary file and parsed. The parsed ``"content"``
    must still contain the title and first section heading text, and the
    metadata must report the file type ``"md"``. The temporary file is
    removed afterwards whether or not the assertions pass.
    """
    # Function-scope import: an ImportError surfaces as a failure of this
    # test rather than as a collection error for the whole module.
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    md_source = """# Policy Title
## Section 1
This is section content.
### Subsection
More content here."""

    # delete=False keeps the file on disk once the handle is closed, so it
    # can be passed to the parser by path; the finally clause removes it.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
        handle.write(md_source)
        md_path = handle.name

    try:
        parsed = doc_parser.parse_document(md_path)
        for heading_text in ("Policy Title", "Section 1"):
            assert heading_text in parsed["content"]
        assert parsed["metadata"]["file_type"] == "md"
    finally:
        os.unlink(md_path)
def test_parse_unsupported_format():
    """Expect ``ValueError`` when the path has an unsupported extension.

    ``parse_document`` is called with ``"test.xyz"`` and must raise a
    ``ValueError`` whose message matches ``"Unsupported file format"``.
    """
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    expect_unsupported = pytest.raises(ValueError, match="Unsupported file format")
    with expect_unsupported:
        doc_parser.parse_document("test.xyz")
def test_parse_nonexistent_file():
    """Expect ``FileNotFoundError`` when parsing the path ``"nonexistent.txt"``."""
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    expect_missing = pytest.raises(FileNotFoundError)
    with expect_missing:
        doc_parser.parse_document("nonexistent.txt")
def test_parse_real_policy_document():
    """Parse the employee handbook from the policy corpus and check the result.

    The handbook is read from ``synthetic_policies/employee_handbook.md``.
    NOTE(review): the path is relative, so this presumably expects the test
    run to start from the repository root -- confirm against the CI setup.

    Checks that the metadata names the file and reports type ``"md"``, that
    the content contains the handbook title and the ``HR-POL-001`` marker and
    is longer than 100 characters, and that the metadata includes a positive
    ``file_size`` and a ``file_path`` entry.
    """
    from src.ingestion.document_parser import DocumentParser

    doc_parser = DocumentParser()
    # Document taken from the project's own corpus rather than a temp file.
    parsed = doc_parser.parse_document("synthetic_policies/employee_handbook.md")

    # Identification metadata.
    meta = parsed["metadata"]
    assert "employee_handbook.md" in meta["filename"]
    assert meta["file_type"] == "md"

    # Content structure.
    body = parsed["content"]
    assert "Employee Handbook" in body
    assert "HR-POL-001" in body
    assert len(body) > 100  # Should have substantial content

    # Metadata completeness.
    for required_key in ("file_size", "file_path"):
        assert required_key in meta
    assert meta["file_size"] > 0