Spaces:

garyd1
/

text_translator

Sleeping

App Files Files Community

garyd1 committed on Feb 25

Commit

57ec4e3

verified ·

1 Parent(s): 8953790

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -22

app.py CHANGED Viewed

@@ -5,12 +5,14 @@ import streamlit as st
 import pandas as pd
 import torch
 import nltk
 from langchain.chat_models import ChatOpenAI
 from langchain.schema import SystemMessage, HumanMessage
 from sentence_transformers import SentenceTransformer, util
-# Try to load spaCy for advanced NLP processing
 try:
     import spacy
     nlp = spacy.load("en_core_web_sm")
@@ -26,7 +28,7 @@ model = SentenceTransformer('all-MiniLM-L6-v2')
 @st.cache_data
 def load_glossary_from_excel(glossary_file_bytes) -> dict:
-    """Load glossary from an Excel file, applying lemmatization and sorting by length."""
     df = pd.read_excel(glossary_file_bytes)
     glossary = {}
@@ -48,37 +50,52 @@ def compute_glossary_embeddings_cached(glossary_items: tuple):
     embeddings = model.encode(glossary_terms, convert_to_tensor=True)
     return glossary_terms, embeddings
def translate_text(text: str) -> str:
    """Translate ``text`` to Canadian French with the chat model.

    The text is sent together with a professional-translator system prompt
    through the module-level ``translator``; the reply content is returned
    with surrounding whitespace stripped.
    """
    system_prompt = SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context.")
    user_prompt = HumanMessage(content=text)
    reply = translator([system_prompt, user_prompt])
    return reply.content.strip()
 def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
     """Apply glossary replacements to ``text`` based on semantic similarity.

     The text is split into sentences (spaCy when ``use_spacy`` is set,
     otherwise NLTK). Each non-blank sentence is embedded and compared with
     the cached glossary-term embeddings by cosine similarity. When the best
     score reaches ``threshold``, whole-word, case-insensitive occurrences of
     that single best-matching term are replaced by its glossary value.

     Args:
         text: Text to post-process.
         glossary: Mapping of term to replacement.
         threshold: Minimum cosine similarity for the best-matching term to
             be applied to a sentence.

     Returns:
         The processed, stripped sentences joined by single spaces; blank
         sentences are dropped. ``text`` is returned unchanged when the
         glossary is empty.
     """
     # Nothing to enforce; this also avoids taking a max over an empty
     # similarity matrix further down.
     if not glossary:
         return text
     glossary_items = tuple(sorted(glossary.items()))
     glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
     sentences = nltk.tokenize.sent_tokenize(text) if not use_spacy else [sent.text for sent in nlp(text).sents]
     updated_sentences = []
     for sentence in sentences:
         if not sentence.strip():
             continue
         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
         cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
         max_score, max_idx = torch.max(cos_scores, dim=1)
         if max_score.item() >= threshold:
             # Convert the one-element index tensor to a plain int before
             # indexing the Python sequence of terms.
             term = glossary_terms[max_idx.item()]
             replacement = glossary[term]
             pattern = r'\b' + re.escape(term) + r'\b'
             # A callable replacement is inserted literally; a plain string
             # would have backslashes parsed as escapes / group references.
             sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)
         updated_sentences.append(sentence.strip())
     return " ".join(updated_sentences)
@@ -91,9 +108,18 @@ def validate_translation(original_text, final_text):
     response = translator(messages)
     return response.content.strip()
 # Streamlit UI
-st.title("AI-Powered English to Canadian French Translator")
-st.write("This app uses AI agents for translation, glossary enforcement, and meaning validation.")
 input_text = st.text_area("Enter text to translate:")
 glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
@@ -106,12 +132,13 @@ if st.button("Translate"):
         st.error("Glossary file is required.")
     else:
         glossary = load_glossary_from_excel(glossary_file)
-        translated_text = translate_text(input_text)
         glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
-        validation_result = validate_translation(input_text, glossary_enforced_text)
         st.subheader("Final Translated Text:")
-        st.write(glossary_enforced_text)
         st.subheader("Validation Check:")
         st.write(validation_result)

 import pandas as pd
 import torch
 import nltk
+import time
+from concurrent.futures import ThreadPoolExecutor
 from langchain.chat_models import ChatOpenAI
 from langchain.schema import SystemMessage, HumanMessage
 from sentence_transformers import SentenceTransformer, util
+# Load NLP libraries
 try:
     import spacy
     nlp = spacy.load("en_core_web_sm")
 @st.cache_data
 def load_glossary_from_excel(glossary_file_bytes) -> dict:
+    """Load glossary from an Excel file, apply lemmatization, and sort by length."""
     df = pd.read_excel(glossary_file_bytes)
     glossary = {}
     embeddings = model.encode(glossary_terms, convert_to_tensor=True)
     return glossary_terms, embeddings
def retry_translate_text(text: str, max_retries: int = 3) -> str:
    """Translate ``text`` to Canadian French, retrying when the call fails.

    The prompt is sent through the module-level ``translator`` chat model.
    Any exception raised by the call is reported with ``print`` and the call
    is retried after a 2-second pause.

    Args:
        text: Source text to translate.
        max_retries: Maximum number of attempts. Values below 1 make no
            attempt at all.

    Returns:
        The stripped content of the model response, or the fixed message
        "Translation failed. Please try again later." when no attempt
        succeeded.
    """
    # The prompt is identical for every attempt, so build it once.
    messages = [
        SystemMessage(content="You are a professional translator. Translate the following text to Canadian French while preserving its meaning and context."),
        HumanMessage(content=text),
    ]
    for attempt in range(max_retries):
        try:
            response = translator(messages)
            return response.content.strip()
        except Exception as e:  # best-effort boundary: any failure triggers a retry
            print(f"Error in translation (attempt {attempt+1}): {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay the failure message.
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retrying
    return "Translation failed. Please try again later."
 def enforce_glossary(text: str, glossary: dict, threshold: float) -> str:
     """Apply glossary replacements to ``text`` based on semantic similarity.

     The text is split into sentences (spaCy when ``use_spacy`` is set,
     otherwise NLTK). Each sentence is embedded and compared with the cached
     glossary-term embeddings by cosine similarity. When the best score
     reaches the cutoff, whole-word, case-insensitive occurrences of that
     single best-matching term are replaced by its glossary value.

     Args:
         text: Text to post-process.
         glossary: Mapping of term to replacement.
         threshold: Accepted for interface compatibility but not consulted.
             The cutoff is 0.85 for sentences longer than 10
             whitespace-separated words and 0.75 otherwise.
             NOTE(review): confirm whether the caller-supplied value should
             still influence the cutoff.

     Returns:
         The processed sentences joined by single spaces, in their original
         order. ``text`` is returned unchanged when the glossary is empty.
     """
     # Nothing to enforce; this also avoids taking a max over an empty
     # similarity matrix further down.
     if not glossary:
         return text
     glossary_items = tuple(sorted(glossary.items()))
     glossary_terms, glossary_embeddings = compute_glossary_embeddings_cached(glossary_items)
     sentences = nltk.tokenize.sent_tokenize(text) if not use_spacy else [sent.text for sent in nlp(text).sents]

     def process_sentence(sentence):
         """Processes a single sentence with glossary enforcement."""
         if not sentence.strip():
             return sentence
         # Dynamic threshold adjustment
         sentence_length = len(sentence.split())
         dynamic_threshold = 0.85 if sentence_length > 10 else 0.75  # Adjust threshold based on sentence length
         sentence_embedding = model.encode(sentence, convert_to_tensor=True)
         cos_scores = util.pytorch_cos_sim(sentence_embedding, glossary_embeddings)
         max_score, max_idx = torch.max(cos_scores, dim=1)
         if max_score.item() >= dynamic_threshold:
             # Convert the one-element index tensor to a plain int before
             # indexing the Python sequence of terms.
             term = glossary_terms[max_idx.item()]
             replacement = glossary[term]
             pattern = r'\b' + re.escape(term) + r'\b'
             # A callable replacement is inserted literally; a plain string
             # would have backslashes parsed as escapes / group references.
             sentence = re.sub(pattern, lambda _match: replacement, sentence, flags=re.IGNORECASE)
         return sentence.strip()

     # Process sentences in parallel for speed; executor.map yields results
     # in input order, so sentence order is preserved.
     with ThreadPoolExecutor() as executor:
         updated_sentences = list(executor.map(process_sentence, sentences))
     return " ".join(updated_sentences)
     response = translator(messages)
     return response.content.strip()
def grammar_correction(text: str) -> str:
    """Ask the chat model to correct grammatical mistakes in ``text``.

    The text is sent together with a French-grammar-expert system prompt
    through the module-level ``translator``; the reply content is returned
    with surrounding whitespace stripped.
    """
    system_prompt = SystemMessage(content="You are a French grammar expert. Correct any grammatical mistakes in the following text.")
    user_prompt = HumanMessage(content=text)
    reply = translator([system_prompt, user_prompt])
    return reply.content.strip()
 # Streamlit UI
+st.title("Optimized AI-Powered English to Canadian French Translator")
+st.write("This version includes retries, batch processing, glossary tuning, and grammar correction.")
 input_text = st.text_area("Enter text to translate:")
 glossary_file = st.file_uploader("Upload Glossary File (Excel)", type=["xlsx"])
         st.error("Glossary file is required.")
     else:
         glossary = load_glossary_from_excel(glossary_file)
+        translated_text = retry_translate_text(input_text)
         glossary_enforced_text = enforce_glossary(translated_text, glossary, threshold)
+        corrected_text = grammar_correction(glossary_enforced_text)
+        validation_result = validate_translation(input_text, corrected_text)
         st.subheader("Final Translated Text:")
+        st.write(corrected_text)
         st.subheader("Validation Check:")
         st.write(validation_result)