# SmartPagerankSearch / tests / test_hierarchy_engine.py
# GitHub Action
# Sync from GitHub Actions (Clean Commit)
# 7f22d3c
import os
import shutil
import tempfile
import unittest

import numpy as np

from hierarchy_engine import HierarchicalIndex
class TestHierarchicalIndex(unittest.TestCase):
    """Tests for building, searching, saving and loading a HierarchicalIndex.

    Every test gets a fresh scratch directory and the same seeded set of
    random vectors, so a failure can be reproduced by re-running the test.
    """

    # Fixture dimensions: 100 vectors, 128 dimensions.
    NUM_VECTORS = 100
    VECTOR_DIM = 128
    # Fixed seed so the random fixture is identical on every run.
    SEED = 42

    def setUp(self):
        """Create a private scratch directory and the dummy vectors/ids/payloads."""
        # A unique temp directory instead of a fixed name in the current
        # working directory: tearDown removes the whole tree, so a fixed name
        # could delete a pre-existing directory or collide with a parallel run.
        self.test_dir = tempfile.mkdtemp(prefix="test_hierarchy_data_")
        self.index_path = os.path.join(self.test_dir, "test_index.pkl")

        # Local, seeded generator: deterministic data without touching
        # NumPy's global random state.
        rng = np.random.default_rng(self.SEED)
        self.vectors = rng.random(
            (self.NUM_VECTORS, self.VECTOR_DIM)
        ).astype('float32')
        self.ids = [f"id_{i}" for i in range(self.NUM_VECTORS)]
        self.payloads = [{"info": f"data_{i}"} for i in range(self.NUM_VECTORS)]

    def tearDown(self):
        """Remove the scratch directory created by setUp."""
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_build_and_search(self):
        """Build an index, then search for a vector that was indexed."""
        # Initialize index with small cluster numbers for testing.
        index = HierarchicalIndex(layer2_clusters=5, layer1_clusters=3)
        index.build(self.vectors, self.ids, self.payloads)

        # Check that the layers are populated.
        self.assertIsNotNone(index.layer2_centroids)
        self.assertGreater(len(index.layer1_centroids), 0)
        self.assertGreater(len(index.layer1_children), 0)

        # Search for the first vector itself.
        query_vector = self.vectors[0]
        results = index.search(query_vector, top_k=5, beam_size=5)
        self.assertGreater(len(results), 0)

        # The first result should ideally be the vector itself (id_0).
        # Note: K-Means is heuristic, so it might not always find the exact
        # match if the beam size is small, but with beam_size=5 and small
        # data it likely will.
        found_ids = [hit['id'] for hit in results]
        self.assertIn("id_0", found_ids)

        # Check result structure.
        top_hit = results[0]
        self.assertIn('score', top_hit)
        self.assertIn('payload', top_hit)
        self.assertEqual(top_hit['payload']['info'], "data_0")

    def test_save_and_load(self):
        """Save a built index, load it back, and search the loaded copy."""
        index = HierarchicalIndex(layer2_clusters=2, layer1_clusters=2)
        index.build(self.vectors, self.ids, self.payloads)
        index.save(self.index_path)

        loaded_index = HierarchicalIndex.load(self.index_path)
        self.assertEqual(loaded_index.layer2_k, index.layer2_k)
        self.assertEqual(len(loaded_index.ids), len(index.ids))

        # Verify search works on the loaded index.
        query_vector = self.vectors[10]
        results = loaded_index.search(query_vector, top_k=1)
        self.assertEqual(results[0]['id'], "id_10")

    def test_empty_build(self):
        """Placeholder for the empty-input behaviour of build().

        Only default construction is checked. The original note expects
        KMeans to raise when n_samples < n_clusters, but the intended
        contract (graceful handling vs. a specific error) is undecided.
        """
        index = HierarchicalIndex()
        self.assertIsInstance(index, HierarchicalIndex)
        # Report the gap as a skip rather than a vacuous pass.
        # TODO: assert the real behaviour once the contract is decided.
        self.skipTest(
            "empty-input behaviour of HierarchicalIndex.build() is not specified yet"
        )
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()