Spaces:
Sleeping
Sleeping
add parallelization
Browse files
- app.py +17 -11
- classifiers.py +27 -9
- utils.py +10 -0
app.py
CHANGED
|
@@ -12,6 +12,7 @@ import time
|
|
| 12 |
import torch
|
| 13 |
import traceback
|
| 14 |
import logging
|
|
|
|
| 15 |
|
| 16 |
# Import local modules
|
| 17 |
from classifiers import TFIDFClassifier, LLMClassifier
|
|
@@ -106,15 +107,11 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
|
|
| 106 |
if classifier_type == "tfidf":
|
| 107 |
classifier = TFIDFClassifier()
|
| 108 |
results = classifier.classify(texts, category_list)
|
| 109 |
-
elif classifier_type
|
| 110 |
if client is None:
|
| 111 |
return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
elif classifier_type == "gpt4":
|
| 115 |
-
if client is None:
|
| 116 |
-
return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
|
| 117 |
-
classifier = LLMClassifier(client=client, model="gpt-4")
|
| 118 |
results = classifier.classify(texts, category_list)
|
| 119 |
else: # hybrid
|
| 120 |
if client is None:
|
|
@@ -126,12 +123,21 @@ def process_file(file, text_columns, categories, classifier_type, show_explanati
|
|
| 126 |
# Second pass with LLM for low confidence results
|
| 127 |
llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
| 128 |
results = []
|
|
|
|
|
|
|
|
|
|
| 129 |
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
|
| 130 |
if tfidf_result["confidence"] < 70: # If confidence is below 70%
|
| 131 |
-
|
| 132 |
-
|
|
|
|
| 133 |
else:
|
| 134 |
results.append(tfidf_result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
# Create results dataframe
|
| 137 |
result_df = df.copy()
|
|
@@ -364,7 +370,7 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 364 |
def show_results(df, validation_report):
|
| 365 |
"""Show the results after processing"""
|
| 366 |
if df is None:
|
| 367 |
-
return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False)
|
| 368 |
|
| 369 |
# Sort by category if it exists
|
| 370 |
if "Category" in df.columns:
|
|
@@ -374,7 +380,7 @@ with gr.Blocks(title="Text Classification System") as demo:
|
|
| 374 |
csv_path = export_results(df, "csv")
|
| 375 |
excel_path = export_results(df, "excel")
|
| 376 |
|
| 377 |
-
return gr.Row(visible=True), gr.File(value=csv_path, visible=True), gr.File(value=excel_path, visible=True), gr.Dataframe(value=df, visible=True)
|
| 378 |
|
| 379 |
# Function to suggest a new category
|
| 380 |
def suggest_new_category(file, current_categories, text_columns):
|
|
|
|
| 12 |
import torch
|
| 13 |
import traceback
|
| 14 |
import logging
|
| 15 |
+
import asyncio
|
| 16 |
|
| 17 |
# Import local modules
|
| 18 |
from classifiers import TFIDFClassifier, LLMClassifier
|
|
|
|
| 107 |
if classifier_type == "tfidf":
|
| 108 |
classifier = TFIDFClassifier()
|
| 109 |
results = classifier.classify(texts, category_list)
|
| 110 |
+
elif classifier_type in ["gpt35", "gpt4"]:
|
| 111 |
if client is None:
|
| 112 |
return None, "Erreur : Le client API n'est pas initialisé. Veuillez configurer une clé API valide dans l'onglet 'Setup'."
|
| 113 |
+
model = "gpt-3.5-turbo" if classifier_type == "gpt35" else "gpt-4"
|
| 114 |
+
classifier = LLMClassifier(client=client, model=model)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
results = classifier.classify(texts, category_list)
|
| 116 |
else: # hybrid
|
| 117 |
if client is None:
|
|
|
|
| 123 |
# Second pass with LLM for low confidence results
|
| 124 |
llm_classifier = LLMClassifier(client=client, model="gpt-3.5-turbo")
|
| 125 |
results = []
|
| 126 |
+
low_confidence_texts = []
|
| 127 |
+
low_confidence_indices = []
|
| 128 |
+
|
| 129 |
for i, (text, tfidf_result) in enumerate(zip(texts, tfidf_results)):
|
| 130 |
if tfidf_result["confidence"] < 70: # If confidence is below 70%
|
| 131 |
+
low_confidence_texts.append(text)
|
| 132 |
+
low_confidence_indices.append(i)
|
| 133 |
+
results.append(None) # Placeholder
|
| 134 |
else:
|
| 135 |
results.append(tfidf_result)
|
| 136 |
+
|
| 137 |
+
if low_confidence_texts:
|
| 138 |
+
llm_results = llm_classifier.classify(low_confidence_texts, category_list)
|
| 139 |
+
for idx, llm_result in zip(low_confidence_indices, llm_results):
|
| 140 |
+
results[idx] = llm_result
|
| 141 |
|
| 142 |
# Create results dataframe
|
| 143 |
result_df = df.copy()
|
|
|
|
| 370 |
def show_results(df, validation_report):
|
| 371 |
"""Show the results after processing"""
|
| 372 |
if df is None:
|
| 373 |
+
return gr.Row(visible=False), gr.File(visible=False), gr.File(visible=False), gr.Dataframe(visible=False)
|
| 374 |
|
| 375 |
# Sort by category if it exists
|
| 376 |
if "Category" in df.columns:
|
|
|
|
| 380 |
csv_path = export_results(df, "csv")
|
| 381 |
excel_path = export_results(df, "excel")
|
| 382 |
|
| 383 |
+
return gr.Row(visible=True), gr.File(value=csv_path, visible=True), gr.File(value=excel_path, visible=True), gr.Dataframe(value=df, visible=True)
|
| 384 |
|
| 385 |
# Function to suggest a new category
|
| 386 |
def suggest_new_category(file, current_categories, text_columns):
|
classifiers.py
CHANGED
|
@@ -5,6 +5,8 @@ from sklearn.cluster import KMeans
|
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
import random
|
| 7 |
import json
|
|
|
|
|
|
|
| 8 |
|
| 9 |
class BaseClassifier:
|
| 10 |
"""Base class for text classifiers"""
|
|
@@ -143,21 +145,37 @@ class LLMClassifier(BaseClassifier):
|
|
| 143 |
self.client = client
|
| 144 |
self.model = model
|
| 145 |
|
| 146 |
-
def classify(self, texts, categories=None):
|
| 147 |
-
"""Classify texts using an LLM"""
|
| 148 |
if not categories:
|
| 149 |
# First, use LLM to generate appropriate categories
|
| 150 |
categories = self._suggest_categories(texts)
|
| 151 |
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
#
|
| 155 |
-
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
return results
|
| 159 |
|
| 160 |
-
def _suggest_categories(self, texts, sample_size=20):
|
| 161 |
"""Use LLM to suggest appropriate categories for the dataset"""
|
| 162 |
# Take a sample of texts to avoid token limitations
|
| 163 |
if len(texts) > sample_size:
|
|
@@ -192,7 +210,7 @@ class LLMClassifier(BaseClassifier):
|
|
| 192 |
print(f"Error suggesting categories: {str(e)}")
|
| 193 |
return self._generate_default_categories(texts)
|
| 194 |
|
| 195 |
-
def _classify_text(self, text, categories):
|
| 196 |
"""Use LLM to classify a single text"""
|
| 197 |
categories_str = ", ".join(categories)
|
| 198 |
|
|
|
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
import random
|
| 7 |
import json
|
| 8 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
|
| 11 |
class BaseClassifier:
|
| 12 |
"""Base class for text classifiers"""
|
|
|
|
| 145 |
self.client = client
|
| 146 |
self.model = model
|
| 147 |
|
| 148 |
+
def classify(self, texts: List[str], categories: Optional[List[str]] = None) -> List[Dict[str, Any]]:
    """Classify texts using an LLM with parallel processing.

    Every text is handed to ``self._classify_text`` on a worker thread. The
    returned list is aligned with ``texts`` (``results[i]`` belongs to
    ``texts[i]``), which callers rely on when they pair results with inputs
    by position.

    Args:
        texts: Texts to classify.
        categories: Candidate categories. When empty or ``None``, categories
            are first obtained from ``self._suggest_categories(texts)``.

    Returns:
        One result dict per input text, in input order. If classifying a
        text raises, a fallback dict with ``confidence`` 50 and the error
        message as ``explanation`` is returned for that text instead.
    """
    if not texts:
        # Nothing to classify: skip the category-suggestion call entirely.
        return []

    if not categories:
        # First, use LLM to generate appropriate categories
        categories = self._suggest_categories(texts)

    # Fallback label for failed items; guarded so an empty category list
    # cannot raise IndexError from inside the error handler.
    fallback_category = categories[0] if categories else "Unknown"

    # Process texts in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks, remembering them in input order.
        futures = [
            executor.submit(self._classify_text, text, categories)
            for text in texts
        ]

        # Collect in submission order rather than completion order:
        # iterating with as_completed() would shuffle the results relative
        # to ``texts`` and attach classifications to the wrong rows.
        results = []
        for future in futures:
            try:
                results.append(future.result())
            except Exception as e:
                print(f"Error processing text: {str(e)}")
                results.append({
                    "category": fallback_category,
                    "confidence": 50,
                    "explanation": f"Error during classification: {str(e)}"
                })

    return results
|
| 177 |
|
| 178 |
+
def _suggest_categories(self, texts: List[str], sample_size: int = 20) -> List[str]:
|
| 179 |
"""Use LLM to suggest appropriate categories for the dataset"""
|
| 180 |
# Take a sample of texts to avoid token limitations
|
| 181 |
if len(texts) > sample_size:
|
|
|
|
| 210 |
print(f"Error suggesting categories: {str(e)}")
|
| 211 |
return self._generate_default_categories(texts)
|
| 212 |
|
| 213 |
+
def _classify_text(self, text: str, categories: List[str]) -> Dict[str, Any]:
|
| 214 |
"""Use LLM to classify a single text"""
|
| 215 |
categories_str = ", ".join(categories)
|
| 216 |
|
utils.py
CHANGED
|
@@ -64,6 +64,16 @@ def visualize_results(df, text_column, category_column="Category"):
|
|
| 64 |
Returns:
|
| 65 |
matplotlib.figure.Figure: Visualization figure
|
| 66 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
# Get categories and their counts
|
| 68 |
category_counts = df[category_column].value_counts()
|
| 69 |
|
|
|
|
| 64 |
Returns:
|
| 65 |
matplotlib.figure.Figure: Visualization figure
|
| 66 |
"""
|
| 67 |
+
# Check if category column exists
|
| 68 |
+
if category_column not in df.columns:
|
| 69 |
+
# Create a simple figure with a message
|
| 70 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 71 |
+
ax.text(0.5, 0.5, "No categories to display",
|
| 72 |
+
ha='center', va='center', fontsize=12)
|
| 73 |
+
ax.set_title('No Classification Results Available')
|
| 74 |
+
plt.tight_layout()
|
| 75 |
+
return fig
|
| 76 |
+
|
| 77 |
# Get categories and their counts
|
| 78 |
category_counts = df[category_column].value_counts()
|
| 79 |
|