Spaces:
Runtime error
Runtime error
File size: 4,551 Bytes
eeb0f9c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
"""
Language Detector
Automatically detect user's language from their message
"""
import re
import unicodedata
from enum import Enum
from typing import Optional, Tuple
class Language(str, Enum):
    """Languages the detector can report.

    Members also subclass ``str``, so each one compares equal to its
    two-letter code (for example ``Language.ENGLISH == "en"``).
    """

    # Two-letter code used for Vietnamese.
    VIETNAMESE = "vi"

    # Two-letter code used for English.
    ENGLISH = "en"
class LanguageDetector:
    """Heuristic Vietnamese/English detector for user input.

    Detection runs in two stages:

    1. If the text contains any character from ``VIETNAMESE_CHARS`` it is
       classified as Vietnamese.
    2. Otherwise its words are matched against ``VIETNAMESE_WORDS`` and
       ``ENGLISH_WORDS`` and the language with more hits wins.

    Whenever the text is too short, or the word evidence is missing or
    tied, the result falls back to Vietnamese.
    """

    # Lowercase letters carrying Vietnamese diacritics, plus "đ".
    # NOTE: some of these (e.g. "é", "à") also occur in other Latin-script
    # languages, so a single such letter is enough to classify as Vietnamese.
    VIETNAMESE_CHARS = set('àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ')

    # Common Vietnamese words
    VIETNAMESE_WORDS = {
        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi', 'nói',
        'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ', 'viện'
    }

    # Common English words
    ENGLISH_WORDS = {
        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have', 'has',
        'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my', 'your',
        'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why', 'who',
        'eat', 'drink', 'exercise', 'weight', 'health', 'doctor', 'pain', 'feel'
    }

    # Word tokenizer, compiled once instead of on every call.
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    @staticmethod
    def _normalize(text: str) -> str:
        """Return ``text`` in NFC form and lowercased."""
        # Compose first: in decomposed input (base letter + combining marks)
        # no character matches VIETNAMESE_CHARS, and the word pattern splits
        # words at the combining marks (decomposed "tôi" -> "t", "o", "i").
        return unicodedata.normalize('NFC', text).lower()

    @staticmethod
    def _count_word_hits(normalized: str) -> Tuple[int, int]:
        """Return ``(vietnamese_hits, english_hits)`` for normalized text."""
        words = LanguageDetector._WORD_PATTERN.findall(normalized)
        vietnamese_hits = sum(
            1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS
        )
        english_hits = sum(
            1 for word in words if word in LanguageDetector.ENGLISH_WORDS
        )
        return vietnamese_hits, english_hits

    @staticmethod
    def detect(text: str) -> Language:
        """
        Detect language from text

        Args:
            text: Input text

        Returns:
            Detected language (vi or en). Vietnamese is returned for empty
            or very short input and when the word counts are tied.
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE  # Default

        normalized = LanguageDetector._normalize(text)

        # A single Vietnamese-specific character decides the result.
        if any(char in LanguageDetector.VIETNAMESE_CHARS for char in normalized):
            return Language.VIETNAMESE

        vietnamese_hits, english_hits = LanguageDetector._count_word_hits(normalized)

        # English needs strictly more hits; ties and no hits stay Vietnamese.
        if english_hits > vietnamese_hits:
            return Language.ENGLISH
        return Language.VIETNAMESE

    @staticmethod
    def detect_with_confidence(text: str) -> Tuple[Language, float]:
        """
        Detect language with confidence score

        Args:
            text: Input text

        Returns:
            (language, confidence_score). The language matches what
            ``detect`` returns for the same text. The score is 0.5 when
            the result is only the Vietnamese fallback (short input, no
            matched words, or a tie).
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE, 0.5

        normalized = LanguageDetector._normalize(text)

        # Share of letters that are Vietnamese-specific, doubled and capped
        # at 1.0.
        vietnamese_char_count = sum(
            1 for char in normalized if char in LanguageDetector.VIETNAMESE_CHARS
        )
        total_chars = sum(1 for char in normalized if char.isalpha())
        if vietnamese_char_count > 0 and total_chars > 0:
            confidence = min(vietnamese_char_count / total_chars * 2, 1.0)
            return Language.VIETNAMESE, confidence

        vietnamese_hits, english_hits = LanguageDetector._count_word_hits(normalized)
        total_matched = vietnamese_hits + english_hits
        if total_matched == 0:
            return Language.VIETNAMESE, 0.5

        if english_hits > vietnamese_hits:
            return Language.ENGLISH, english_hits / total_matched

        # Vietnamese wins outright or on a tie, as in detect(); a tie
        # evaluates to exactly 0.5 here.
        return Language.VIETNAMESE, vietnamese_hits / total_matched
def detect_language(text: str) -> Language:
    """Return the detected language of ``text``.

    Module-level shortcut that delegates to ``LanguageDetector.detect``.
    """
    detected = LanguageDetector.detect(text)
    return detected
|