| from sentence_transformers import SentenceTransformer | |
| import hdbscan | |
| model = SentenceTransformer("all-MiniLM-L6-v2") | |
| def cluster_items(items, min_cluster_size=2): | |
| texts = [item["title"] for item in items] | |
| if not texts: | |
| return [] | |
| embeddings = model.encode(texts) | |
| clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric="euclidean") | |
| labels = clusterer.fit_predict(embeddings) | |
| clusters = {} | |
| for label, text in zip(labels, texts): | |
| if label == -1: | |
| continue | |
| clusters.setdefault(label, []).append(text) | |
| cluster_text = [] | |
| for i, titles in clusters.items(): | |
| cluster_text.append(f"🔸 Cluster {i} ({len(titles)} items):") | |
| cluster_text.extend(f"- {t}" for t in titles) | |
| cluster_text.append("") # add space between clusters | |
| return "\n".join(cluster_text) or "No meaningful clusters found." | |