Spaces:

lapnt3
/

my-gradio-app

Runtime error

File size: 4,551 Bytes

eeb0f9c

"""
Language Detector
Automatically detect user's language from their message
"""

import re
import unicodedata
from enum import Enum
from typing import Optional, Tuple


class Language(str, Enum):
    """Supported languages, identified by their two-letter codes.

    Mixing in ``str`` makes every member a string equal to its value, so a
    member compares equal to the plain code (``Language.ENGLISH == "en"``).
    """
    # Vietnamese
    VIETNAMESE = "vi"
    # English
    ENGLISH = "en"


class LanguageDetector:
    """Heuristic Vietnamese/English detector for user input.

    Detection runs in two stages:

    1. If the text contains any Vietnamese-specific letter (a vowel carrying
       a diacritic, or ``đ``), it is classified as Vietnamese.
    2. Otherwise its words are compared against two small keyword lists and
       the language with more hits wins.

    Vietnamese is the fallback whenever the input is empty, shorter than two
    characters after stripping, matches no keyword, or matches both keyword
    lists equally often.
    """

    # Vietnamese-specific characters (lowercase, precomposed form)
    VIETNAMESE_CHARS = set('àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ')

    # Common Vietnamese words
    VIETNAMESE_WORDS = {
        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi', 'nói',
        'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ', 'viện'
    }

    # Common English words
    ENGLISH_WORDS = {
        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have', 'has',
        'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my', 'your',
        'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why', 'who',
        'eat', 'drink', 'exercise', 'weight', 'health', 'doctor', 'pain', 'feel'
    }

    # Word tokenizer, compiled once and shared by both detection methods
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    @staticmethod
    def _prepare(text: str) -> str:
        """Return *text* lowercased and composed to Unicode NFC.

        Some input sources deliver accented letters in decomposed form (base
        letter followed by combining marks). Those sequences never equal the
        precomposed entries of ``VIETNAMESE_CHARS``, so the text is composed
        before any matching is done.
        """
        return unicodedata.normalize('NFC', text.lower())

    @staticmethod
    def _count_keyword_hits(text_lower: str) -> Tuple[int, int]:
        """Return ``(vietnamese_hits, english_hits)`` for the prepared text."""
        words = LanguageDetector._WORD_PATTERN.findall(text_lower)
        vietnamese_hits = sum(1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS)
        english_hits = sum(1 for word in words if word in LanguageDetector.ENGLISH_WORDS)
        return vietnamese_hits, english_hits

    @staticmethod
    def detect(text: str) -> Language:
        """
        Detect language from text

        Args:
            text: Input text

        Returns:
            Detected language (vi or en). Vietnamese is returned for empty or
            one-character input, for text without keyword matches, and when
            both keyword lists match equally often.
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE  # Default

        text_lower = LanguageDetector._prepare(text)

        # A single Vietnamese-specific letter is decisive
        if any(char in LanguageDetector.VIETNAMESE_CHARS for char in text_lower):
            return Language.VIETNAMESE

        vietnamese_word_count, english_word_count = LanguageDetector._count_keyword_hits(text_lower)

        # English only wins with strictly more keyword hits; everything else
        # (Vietnamese majority, tie, no hits) falls back to Vietnamese
        if english_word_count > vietnamese_word_count:
            return Language.ENGLISH

        return Language.VIETNAMESE

    @staticmethod
    def detect_with_confidence(text: str) -> Tuple[Language, float]:
        """
        Detect language with confidence score

        The language returned always agrees with ``detect`` for the same text.

        Args:
            text: Input text

        Returns:
            (language, confidence_score). When Vietnamese-specific letters are
            present the score is twice their share of all alphabetic
            characters, capped at 1.0. Otherwise it is the winning language's
            share of all keyword hits, or 0.5 when the Vietnamese default is
            used (empty input, no keyword hits, or a tie).
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE, 0.5

        text_lower = LanguageDetector._prepare(text)

        # Count Vietnamese characters
        vietnamese_char_count = sum(1 for char in text_lower if char in LanguageDetector.VIETNAMESE_CHARS)
        total_chars = sum(1 for char in text_lower if char.isalpha())

        if vietnamese_char_count > 0 and total_chars > 0:
            confidence = min(vietnamese_char_count / total_chars * 2, 1.0)
            return Language.VIETNAMESE, confidence

        vietnamese_word_count, english_word_count = LanguageDetector._count_keyword_hits(text_lower)

        # Equal counts cover both "no keyword matched" (0 == 0) and a real
        # tie; either way there is no evidence to leave the default language
        if vietnamese_word_count == english_word_count:
            return Language.VIETNAMESE, 0.5

        total_matched = vietnamese_word_count + english_word_count

        if vietnamese_word_count > english_word_count:
            return Language.VIETNAMESE, vietnamese_word_count / total_matched

        return Language.ENGLISH, english_word_count / total_matched


def detect_language(text: str) -> Language:
    """Return the language detected for *text*.

    Module-level shortcut that forwards to ``LanguageDetector.detect``.
    """
    detected = LanguageDetector.detect(text)
    return detected