File size: 4,551 Bytes
eeb0f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Language Detector
Automatically detect user's language from their message
"""

import re
import unicodedata
from enum import Enum
from typing import Optional, Tuple


class Language(str, Enum):
    """Language codes the detector can report.

    Members subclass ``str``, so each one compares equal to its
    two-letter code (for example ``Language.ENGLISH == "en"``).
    """

    # Two-letter code used for Vietnamese.
    VIETNAMESE = "vi"
    # Two-letter code used for English.
    ENGLISH = "en"


class LanguageDetector:
    """Heuristic Vietnamese/English detector for user input.

    Detection runs in two stages:

    1. If the text contains any character from ``VIETNAMESE_CHARS`` it is
       classified as Vietnamese immediately.
    2. Otherwise whole words are matched against ``VIETNAMESE_WORDS`` and
       ``ENGLISH_WORDS`` and the language with more hits wins.

    Vietnamese is the fallback whenever the evidence is missing or tied,
    and both public methods apply that same fallback.
    """

    # Lowercase, precomposed letters carrying Vietnamese diacritics, plus "đ".
    # Several of them (e.g. "é", "à") are also used by other Latin-script
    # languages, so text such as "café" is classified as Vietnamese.
    VIETNAMESE_CHARS = set('àáảãạăằắẳẵặâầấẩẫậèéẻẽẹêềếểễệìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵđ')

    # Common Vietnamese words
    VIETNAMESE_WORDS = {
        'tôi', 'bạn', 'của', 'và', 'có', 'là', 'được', 'không', 'này', 'cho',
        'với', 'đã', 'sẽ', 'để', 'trong', 'một', 'những', 'các', 'như', 'khi',
        'muốn', 'cần', 'nên', 'thì', 'hay', 'hoặc', 'nhưng', 'mà', 'vì', 'nếu',
        'giúp', 'giảm', 'tăng', 'ăn', 'uống', 'tập', 'làm', 'biết', 'hỏi', 'nói',
        'cảm', 'thấy', 'đau', 'khỏe', 'bệnh', 'thuốc', 'bác', 'sĩ', 'viện'
    }

    # Common English words
    ENGLISH_WORDS = {
        'i', 'you', 'the', 'a', 'an', 'is', 'are', 'was', 'were', 'have', 'has',
        'do', 'does', 'can', 'will', 'would', 'should', 'could', 'my', 'your',
        'want', 'need', 'help', 'how', 'what', 'when', 'where', 'why', 'who',
        'eat', 'drink', 'exercise', 'weight', 'health', 'doctor', 'pain', 'feel'
    }

    @staticmethod
    def _prepare(text: str) -> str:
        """Lowercase ``text`` and compose it to Unicode NFC form.

        ``VIETNAMESE_CHARS`` and ``VIETNAMESE_WORDS`` hold precomposed
        letters. Decomposed input (a base letter followed by combining
        marks) would otherwise match neither of them, and the combining
        marks would also split words for the ``\\w`` based tokenizer.
        """
        return unicodedata.normalize('NFC', text.lower())

    @staticmethod
    def _count_word_hits(prepared: str) -> Tuple[int, int]:
        """Return ``(vietnamese_hits, english_hits)`` for ``prepared`` text.

        ``prepared`` is expected to be the output of ``_prepare``. Every
        occurrence of a listed word counts, so repeated words count again.
        """
        words = re.findall(r'\b\w+\b', prepared)
        vietnamese_hits = sum(1 for word in words if word in LanguageDetector.VIETNAMESE_WORDS)
        english_hits = sum(1 for word in words if word in LanguageDetector.ENGLISH_WORDS)
        return vietnamese_hits, english_hits

    @staticmethod
    def detect(text: str) -> Language:
        """
        Detect language from text

        Args:
            text: Input text

        Returns:
            Detected language (vi or en). Vietnamese is returned for empty
            or very short input (fewer than 2 characters after stripping),
            for text with no word-list hits, and for ties.
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE  # Default

        prepared = LanguageDetector._prepare(text)

        # Any Vietnamese-specific letter settles the question on its own.
        if any(char in LanguageDetector.VIETNAMESE_CHARS for char in prepared):
            return Language.VIETNAMESE

        vietnamese_hits, english_hits = LanguageDetector._count_word_hits(prepared)

        # English needs a strict majority; ties and "no evidence" stay Vietnamese.
        if english_hits > vietnamese_hits:
            return Language.ENGLISH
        return Language.VIETNAMESE

    @staticmethod
    def detect_with_confidence(text: str) -> Tuple[Language, float]:
        """
        Detect language with confidence score

        The language returned is decided by the same rules as ``detect``,
        including the Vietnamese fallback on a word-count tie.

        Args:
            text: Input text

        Returns:
            (language, confidence_score) where the score is:
            - 0.5 when there is no usable evidence (empty/short input or
              no word-list hits),
            - ``min(2 * vietnamese_chars / alphabetic_chars, 1.0)`` when
              Vietnamese-specific letters are present,
            - the winner's share of all word-list hits otherwise (0.5 on
              a tie).
        """
        if not text or len(text.strip()) < 2:
            return Language.VIETNAMESE, 0.5

        prepared = LanguageDetector._prepare(text)

        # Character stage: share of Vietnamese-specific letters, doubled and capped.
        vietnamese_char_count = sum(1 for char in prepared if char in LanguageDetector.VIETNAMESE_CHARS)
        total_chars = sum(1 for char in prepared if char.isalpha())

        if vietnamese_char_count > 0 and total_chars > 0:
            confidence = min(vietnamese_char_count / total_chars * 2, 1.0)
            return Language.VIETNAMESE, confidence

        # Word stage: compare hits against the two word lists.
        vietnamese_hits, english_hits = LanguageDetector._count_word_hits(prepared)
        total_matched = vietnamese_hits + english_hits

        if total_matched == 0:
            return Language.VIETNAMESE, 0.5

        # English only wins with a strict majority, mirroring ``detect``;
        # a tie therefore reports Vietnamese with confidence 0.5.
        if english_hits > vietnamese_hits:
            return Language.ENGLISH, english_hits / total_matched
        return Language.VIETNAMESE, vietnamese_hits / total_matched


def detect_language(text: str) -> Language:
    """Module-level shortcut for ``LanguageDetector.detect``.

    Args:
        text: Input text

    Returns:
        Whatever ``LanguageDetector.detect(text)`` returns, unchanged.
    """
    detected = LanguageDetector.detect(text)
    return detected