# Spaces: TUM / SmartPagerankSearch (status: Sleeping)
# File: SmartPagerankSearch / tests / test_hierarchy_engine.py (3.03 kB)
# Last commit: 7f22d3c, 17 days ago, by GitHub Action: "Sync from GitHub Actions (Clean Commit)"
	import os
	import shutil
	import tempfile
	import unittest

	import numpy as np

	from hierarchy_engine import HierarchicalIndex

	class TestHierarchicalIndex(unittest.TestCase):
	    """Tests for ``HierarchicalIndex``: build, search and save/load round trip.

	    Each test runs against 100 seeded pseudo-random float32 vectors of
	    dimension 128 with ids ``id_0`` .. ``id_99`` and payloads
	    ``{"info": "data_<i>"}``, plus a private scratch directory that is
	    removed again in ``tearDown``.
	    """

	    def setUp(self):
	        """Create the scratch directory and the synthetic dataset."""
	        # A unique temporary directory instead of a fixed name relative to the
	        # current working directory: parallel runs cannot collide, and
	        # tearDown can never delete an unrelated folder of the same name.
	        self.test_dir = tempfile.mkdtemp(prefix="test_hierarchy_data_")
	        self.index_path = os.path.join(self.test_dir, "test_index.pkl")

	        # Seeded generator so every run sees the same 100 x 128 fixture.
	        # NOTE(review): any randomness inside HierarchicalIndex itself is not
	        # controlled from here -- confirm whether it needs its own seed.
	        rng = np.random.default_rng(42)
	        self.vectors = rng.random((100, 128)).astype('float32')
	        self.ids = [f"id_{i}" for i in range(100)]
	        self.payloads = [{"info": f"data_{i}"} for i in range(100)]

	    def tearDown(self):
	        """Remove the scratch directory and everything written into it."""
	        if os.path.exists(self.test_dir):
	            shutil.rmtree(self.test_dir)

	    def test_build_and_search(self):
	        """Build a small index and search for a vector that is part of it."""
	        # Small cluster counts keep the build cheap on 100 samples.
	        index = HierarchicalIndex(layer2_clusters=5, layer1_clusters=3)
	        index.build(self.vectors, self.ids, self.payloads)

	        # build() must have populated both layers.
	        self.assertIsNotNone(index.layer2_centroids)
	        self.assertGreater(len(index.layer1_centroids), 0)
	        self.assertGreater(len(index.layer1_children), 0)

	        # Query with the first indexed vector itself.
	        query_vector = self.vectors[0]
	        results = index.search(query_vector, top_k=5, beam_size=5)
	        self.assertGreater(len(results), 0)

	        # The clustering is heuristic, so an exact hit is not guaranteed for a
	        # small beam; with beam_size=5 on this little data it is expected.
	        found_ids = [r['id'] for r in results]
	        self.assertIn("id_0", found_ids)

	        # Shape of a single hit: it carries a score and the stored payload.
	        self.assertIn('score', results[0])
	        self.assertIn('payload', results[0])
	        self.assertEqual(results[0]['payload']['info'], "data_0")

	    def test_save_and_load(self):
	        """Round-trip an index through save()/load() and search the copy."""
	        index = HierarchicalIndex(layer2_clusters=2, layer1_clusters=2)
	        index.build(self.vectors, self.ids, self.payloads)

	        index.save(self.index_path)
	        loaded_index = HierarchicalIndex.load(self.index_path)

	        # Configuration and contents must survive the round trip.
	        self.assertEqual(loaded_index.layer2_k, index.layer2_k)
	        self.assertEqual(len(loaded_index.ids), len(index.ids))

	        # The loaded index must still answer queries.
	        query_vector = self.vectors[10]
	        results = loaded_index.search(query_vector, top_k=1)
	        # Fail with a clear message instead of an IndexError on an empty result.
	        self.assertGreater(len(results), 0)
	        self.assertEqual(results[0]['id'], "id_10")

	    def test_empty_build(self):
	        """Placeholder for the empty-input behaviour of ``build``.

	        The expected behaviour (graceful handling versus a specific exception)
	        has not been decided, so the test is reported as skipped rather than
	        silently passing without checking anything.
	        """
	        # Default construction must at least succeed.
	        HierarchicalIndex()
	        self.skipTest(
	            "empty-input behaviour of HierarchicalIndex.build is not specified yet"
	        )

	# Run the suite through unittest's command-line runner when this file is
	# executed directly; importing the module triggers nothing.
	if __name__ == '__main__':
	    unittest.main()