Spaces:

sethmcknight
/

msse-ai-engineering

Sleeping

msse-ai-engineering / tests /test_ingestion /test_document_parser.py

sethmcknight

Refactor test cases for improved readability and consistency

159faf0 about 2 months ago

2.9 kB

	import os
	import tempfile
	from pathlib import Path

	import pytest


	def test_parse_txt_file():
	    """Parse a temporary ``.txt`` file and verify its content and metadata."""
	    from src.ingestion.document_parser import DocumentParser

	    expected_text = "This is a test policy document.\nIt has multiple lines."
	    doc_parser = DocumentParser()

	    # delete=False keeps the file on disk once the handle is closed, so the
	    # parser can reopen it by name; the finally clause below removes it.
	    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as handle:
	        handle.write(expected_text)
	        txt_path = handle.name

	    try:
	        parsed = doc_parser.parse_document(txt_path)
	        assert parsed["content"] == expected_text
	        file_info = parsed["metadata"]
	        assert file_info["filename"] == Path(txt_path).name
	        assert file_info["file_type"] == "txt"
	    finally:
	        os.unlink(txt_path)


	def test_parse_markdown_file():
	    """Parse a temporary ``.md`` file and verify headings and file type."""
	    from src.ingestion.document_parser import DocumentParser

	    doc_parser = DocumentParser()
	    md_source = """# Policy Title

	## Section 1
	This is section content.

	### Subsection
	More content here."""

	    # delete=False keeps the file on disk once the handle is closed, so the
	    # parser can reopen it by name; the finally clause below removes it.
	    with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
	        handle.write(md_source)
	        md_path = handle.name

	    try:
	        parsed = doc_parser.parse_document(md_path)
	        # Heading text must appear somewhere in the parsed content.
	        for heading_text in ("Policy Title", "Section 1"):
	            assert heading_text in parsed["content"]
	        assert parsed["metadata"]["file_type"] == "md"
	    finally:
	        os.unlink(md_path)


	def test_parse_unsupported_format():
	    """Expect ``ValueError`` when the file extension is not supported."""
	    from src.ingestion.document_parser import DocumentParser

	    unsupported_path = "test.xyz"
	    doc_parser = DocumentParser()

	    # The message is matched as well, so an unrelated ValueError does not pass.
	    expect_rejection = pytest.raises(ValueError, match="Unsupported file format")
	    with expect_rejection:
	        doc_parser.parse_document(unsupported_path)


	def test_parse_nonexistent_file():
	    """Expect ``FileNotFoundError`` when the path does not exist."""
	    from src.ingestion.document_parser import DocumentParser

	    missing_path = "nonexistent.txt"
	    doc_parser = DocumentParser()

	    expect_missing = pytest.raises(FileNotFoundError)
	    with expect_missing:
	        doc_parser.parse_document(missing_path)


	def test_parse_real_policy_document():
	    """Parse the employee handbook from the corpus and verify content and metadata.

	    The corpus path is resolved from this file's location rather than the
	    current working directory, so the result does not depend on where pytest
	    is started.
	    """
	    from src.ingestion.document_parser import DocumentParser

	    # This file is tests/test_ingestion/test_document_parser.py, so parents[2]
	    # is the repository root.
	    # NOTE(review): assumes synthetic_policies/ sits at the repository root,
	    # as the original cwd-relative path implied - confirm.
	    repo_root = Path(__file__).resolve().parents[2]
	    policy_path = repo_root / "synthetic_policies" / "employee_handbook.md"

	    parser = DocumentParser()
	    result = parser.parse_document(str(policy_path))

	    # Verify content structure
	    assert "employee_handbook.md" in result["metadata"]["filename"]
	    assert result["metadata"]["file_type"] == "md"
	    assert "Employee Handbook" in result["content"]
	    assert "HR-POL-001" in result["content"]
	    assert len(result["content"]) > 100  # Should have substantial content

	    # Verify metadata completeness
	    assert "file_size" in result["metadata"]
	    assert "file_path" in result["metadata"]
	    assert result["metadata"]["file_size"] > 0