Spaces:

simondh
/

classifieur

Sleeping

App Files Files Community

classifieur / classifiers.py

simondh

black .

6f39808 8 months ago

raw

history blame

9.94 kB

	import numpy as np
	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.cluster import KMeans
	from sklearn.metrics.pairwise import cosine_similarity
	import random
	import json
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from typing import List, Dict, Any, Optional
	from prompts import CATEGORY_SUGGESTION_PROMPT, TEXT_CLASSIFICATION_PROMPT


	class BaseClassifier:
	    """Common interface shared by the text classifiers in this module."""

	    def __init__(self):
	        """Create the classifier; the base class keeps no state of its own."""

	    def classify(self, texts, categories=None):
	        """
	        Assign each text in a list to a category.

	        Args:
	            texts (list): Text strings to classify
	            categories (list, optional): Category names. When None, the
	                implementation is expected to come up with categories itself

	        Returns:
	            list: One classification result per text, carrying the category,
	                a confidence score, and an explanation

	        Raises:
	            NotImplementedError: Always, in the base class; subclasses override this
	        """
	        raise NotImplementedError("Subclasses must implement this method")

	    def _generate_default_categories(self, texts, num_clusters=5):
	        """
	        Build placeholder category names ("Category 1", "Category 2", ...).

	        Args:
	            texts (list): Text strings (accepted for interface symmetry; not inspected)
	            num_clusters (int): How many category names to produce

	        Returns:
	            list: Category names, numbered from 1 up to num_clusters
	        """
	        # The names are purely positional; nothing is derived from the texts.
	        names = []
	        for position in range(1, num_clusters + 1):
	            names.append(f"Category {position}")
	        return names


	class TFIDFClassifier(BaseClassifier):
	    """
	    Classifier using TF-IDF and clustering for fast classification.

	    Texts are vectorized with TF-IDF and grouped with k-means. Cluster index
	    ``i`` is reported as ``categories[i]``: the pairing is positional and is
	    not derived from the meaning of the category names.
	    """

	    def __init__(self):
	        super().__init__()
	        self.vectorizer = TfidfVectorizer(
	            max_features=1000, stop_words="english", ngram_range=(1, 2)
	        )
	        # All of the following are populated by classify().
	        self.model = None
	        self.feature_names = None
	        self.categories = None
	        self.centroids = None

	    def classify(self, texts, categories=None):
	        """
	        Classify texts using TF-IDF and clustering.

	        Args:
	            texts (list): Text strings to classify
	            categories (list, optional): Category names. When empty or None,
	                up to five placeholder names are generated

	        Returns:
	            list: One dict per text with "category", "confidence" and
	                "explanation" keys; an empty list when texts is empty
	        """
	        # Nothing to vectorize or cluster; the vectorizer would raise otherwise.
	        if len(texts) == 0:
	            return []

	        # Vectorize the texts
	        X = self.vectorizer.fit_transform(texts)
	        self.feature_names = self.vectorizer.get_feature_names_out()

	        # Auto-detect categories if not provided
	        if not categories:
	            num_clusters = min(5, len(texts))  # Don't create more clusters than texts
	            self.categories = self._generate_default_categories(texts, num_clusters)
	        else:
	            self.categories = categories
	            # KMeans rejects n_clusters > n_samples, so cap at the number of
	            # texts; in that case only the first num_clusters names are used.
	            num_clusters = min(len(categories), len(texts))

	        # Cluster the texts
	        self.model = KMeans(n_clusters=num_clusters, random_state=42)
	        clusters = self.model.fit_predict(X)
	        self.centroids = self.model.cluster_centers_

	        # Calculate distances to centroids for confidence
	        distances = self._calculate_distances(X)

	        # Prepare results
	        results = []
	        for i, cluster_idx in enumerate(clusters):
	            results.append(
	                {
	                    "category": self.categories[cluster_idx],
	                    "confidence": self._calculate_confidence(distances[i]),
	                    "explanation": self._generate_explanation(X[i], cluster_idx),
	                }
	            )

	        return results

	    def _calculate_distances(self, X):
	        """
	        Calculate the Euclidean distance from each point to each centroid.

	        Args:
	            X: Sparse matrix of vectorized texts (must support ``toarray()``)

	        Returns:
	            numpy.ndarray: Array of shape (number of rows in X, number of centroids)
	        """
	        # Broadcast rows against centroids: (n, 1, d) - (1, k, d) -> (n, k, d)
	        deltas = X.toarray()[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
	        return np.sqrt((deltas**2).sum(axis=2))

	    def _calculate_confidence(self, distances):
	        """
	        Convert one text's centroid distances to a confidence score.

	        The score compares the nearest centroid with the farthest one:
	        100 when the text sits exactly on its centroid, approaching 50 as the
	        nearest centroid gets as far away as the farthest.

	        Args:
	            distances: Distances from a single text to every centroid

	        Returns:
	            float: Confidence in the range (50, 100], rounded to one decimal;
	                70 when all distances are equal (including the single-cluster case)
	        """
	        min_dist = np.min(distances)
	        max_dist = np.max(distances)

	        if max_dist == min_dist:
	            return 70  # Default mid-range confidence when all distances are equal

	        # Min-max normalising the row and taking its minimum always yields 0
	        # (hence a constant 100), so use the nearest/farthest ratio instead.
	        # max_dist > min_dist >= 0 here, so the division is safe.
	        relative_dist = min_dist / max_dist

	        # Invert and scale to the 50-100 range (smaller distance = higher confidence)
	        confidence = 100 - (relative_dist * 50)
	        return round(float(confidence), 1)

	    def _generate_explanation(self, text_vector, cluster_idx):
	        """
	        Generate an explanation listing the text's strongest TF-IDF terms.

	        Args:
	            text_vector: Single-row sparse vector for the text
	            cluster_idx: Index of the assigned cluster (currently unused; kept
	                so the signature stays stable)

	        Returns:
	            str: Up to five highest-weighted terms of the text, or a fixed
	                message when the text has no non-zero features
	        """
	        text_array = text_vector.toarray()[0]

	        # Indices of the five largest weights, strongest first
	        top_indices = text_array.argsort()[-5:][::-1]

	        # Keep only terms that actually occur in the text
	        top_features = [self.feature_names[i] for i in top_indices if text_array[i] > 0]

	        if not top_features:
	            return "No significant features identified for this classification."

	        return f"Classification based on key terms: {', '.join(top_features)}"


	class LLMClassifier(BaseClassifier):
	    """Classifier using a Large Language Model for more accurate but slower classification"""

	    def __init__(self, client, model="gpt-3.5-turbo", max_workers=10):
	        """
	        Args:
	            client: Object exposing ``chat.completions.create(...)``
	            model (str): Model name passed to every completion request
	            max_workers (int): Number of threads used to classify texts in parallel
	        """
	        super().__init__()
	        self.client = client
	        self.model = model
	        self.max_workers = max_workers

	    def classify(
	        self, texts: List[str], categories: Optional[List[str]] = None
	    ) -> List[Dict[str, Any]]:
	        """
	        Classify texts using an LLM with parallel processing.

	        Args:
	            texts: Text strings to classify
	            categories: Category names; when empty or None the LLM is asked
	                to suggest them from a sample of the texts

	        Returns:
	            One dict per text, in input order, with "category", "confidence"
	            and "explanation" keys. A text whose request fails gets the first
	            category with confidence 50 and the error in the explanation.
	        """
	        # Avoid a pointless category-suggestion request for an empty batch.
	        if len(texts) == 0:
	            return []

	        if not categories:
	            # First, use LLM to generate appropriate categories
	            categories = self._suggest_categories(texts)

	        # Pre-sized so results can be stored by original index as they finish
	        results = [None] * len(texts)

	        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
	            # Submit all tasks with their original indices
	            future_to_index = {
	                executor.submit(self._classify_text, text, categories): idx
	                for idx, text in enumerate(texts)
	            }

	            # Collect results as they complete
	            for future in as_completed(future_to_index):
	                original_idx = future_to_index[future]
	                try:
	                    results[original_idx] = future.result()
	                except Exception as e:
	                    # Best effort: one failed text must not abort the batch
	                    print(f"Error processing text: {str(e)}")
	                    results[original_idx] = {
	                        "category": categories[0],
	                        "confidence": 50,
	                        "explanation": f"Error during classification: {str(e)}",
	                    }

	        return results

	    def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
	        """
	        Use LLM to suggest appropriate categories for the dataset.

	        Args:
	            texts: Text strings to draw a sample from
	            sample_size: Maximum number of texts included in the prompt

	        Returns:
	            Non-empty category names parsed from the comma-separated reply,
	            or the default placeholder categories if the request fails or
	            yields no names
	        """
	        # Take a sample of texts to avoid token limitations
	        if len(texts) > sample_size:
	            sample_texts = random.sample(texts, sample_size)
	        else:
	            sample_texts = texts

	        prompt = CATEGORY_SUGGESTION_PROMPT.format("\n---\n".join(sample_texts))

	        try:
	            response = self.client.chat.completions.create(
	                model=self.model,
	                messages=[{"role": "user", "content": prompt}],
	                temperature=0.2,
	                max_tokens=100,
	            )

	            # Parse response to get categories; content may be None
	            categories_text = (response.choices[0].message.content or "").strip()
	            categories = [
	                cat.strip() for cat in categories_text.split(",") if cat.strip()
	            ]

	            # An empty reply would otherwise produce unusable blank names
	            if not categories:
	                raise ValueError("LLM returned no category names")

	            return categories
	        except Exception as e:
	            # Fallback to default categories on error
	            print(f"Error suggesting categories: {str(e)}")
	            return self._generate_default_categories(texts)

	    @staticmethod
	    def _parse_json_payload(response_text: str) -> Any:
	        """
	        Parse the reply as JSON, tolerating markdown fences or surrounding prose.

	        Raises:
	            json.JSONDecodeError: If neither the whole reply nor its outermost
	                ``{...}`` span is valid JSON
	        """
	        try:
	            return json.loads(response_text)
	        except json.JSONDecodeError:
	            start = response_text.find("{")
	            end = response_text.rfind("}")
	            if start == -1 or end <= start:
	                raise
	            return json.loads(response_text[start : end + 1])

	    def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
	        """
	        Use LLM to classify a single text.

	        Args:
	            text: Text to classify
	            categories: Allowed category names (must be non-empty)

	        Returns:
	            Dict with "category" (always one of categories), "confidence"
	            (number between 0 and 100) and "explanation"

	        Raises:
	            ValueError: If the reply is valid JSON but is not an object with
	                all three required fields
	            Exception: Whatever the client raises for a failed request
	        """
	        prompt = TEXT_CLASSIFICATION_PROMPT.format(
	            categories=", ".join(categories), text=text
	        )

	        response = self.client.chat.completions.create(
	            model=self.model,
	            messages=[{"role": "user", "content": prompt}],
	            temperature=0,
	            max_tokens=200,
	        )

	        # content may be None; treat that like an unparseable reply
	        response_text = (response.choices[0].message.content or "").strip()

	        try:
	            result = self._parse_json_payload(response_text)
	        except json.JSONDecodeError:
	            # Fall back to simple parsing if JSON fails: first listed
	            # category mentioned in the reply, else the first category
	            lowered = response_text.lower()
	            category = next(
	                (cat for cat in categories if cat.lower() in lowered), categories[0]
	            )
	            return {
	                "category": category,
	                "confidence": 50,
	                "explanation": "Classification based on language model analysis. (Note: Structured response parsing failed)",
	            }

	        # Ensure all required fields are present
	        required = ("category", "confidence", "explanation")
	        if not isinstance(result, dict) or not all(k in result for k in required):
	            raise ValueError("Missing required fields in LLM response")

	        # Validate category is in the list; accept case/whitespace variants
	        # before defaulting to the first category
	        if result["category"] not in categories:
	            wanted = str(result["category"]).strip().lower()
	            result["category"] = next(
	                (cat for cat in categories if cat.strip().lower() == wanted),
	                categories[0],
	            )

	        # Validate confidence is a number between 0 and 100
	        try:
	            confidence = float(result["confidence"])
	        except (TypeError, ValueError, OverflowError):
	            confidence = 50
	        else:
	            # Also rejects NaN, for which both comparisons are False
	            if not 0 <= confidence <= 100:
	                confidence = 50
	        result["confidence"] = confidence

	        return result