File size: 7,877 Bytes
1ee396e
 
 
 
 
 
 
 
 
 
f2d9b6e
 
 
 
 
 
 
56943b3
abc6868
1ee396e
 
 
 
 
4784f3b
1ee396e
 
4784f3b
1ee396e
 
 
 
 
abc6868
1ee396e
abc6868
 
1f2982b
1ee396e
 
 
1f2982b
 
 
a4274f6
1f2982b
97e3d11
 
 
 
 
 
 
 
 
 
5fc20da
97e3d11
1f2982b
97e3d11
5fc20da
97e3d11
 
1f2982b
 
 
 
595c35f
1f2982b
 
8d5fa5e
97e3d11
1f2982b
 
 
8d5fa5e
ffdaf59
41fcaae
8751493
595c35f
 
 
 
 
1f2982b
1ee396e
1f2982b
a4274f6
1ee396e
08efe9a
1ee396e
a4274f6
1ee396e
 
 
 
 
 
4dc7b7d
1b679c3
1ee396e
 
614dfc7
9b52dc7
1b679c3
 
fb6f7ac
 
1b679c3
9b52dc7
f2d9b6e
fb6f7ac
9b52dc7
1ee396e
 
 
 
 
f2d9b6e
1ee396e
 
8751493
1ee396e
abc6868
31e0f17
abc6868
8751493
abc6868
31e0f17
7da3ef8
1ee396e
 
378b616
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ee396e
 
31e0f17
1ee396e
d2137e3
 
31e0f17
d2137e3
31e0f17
 
 
 
 
 
6acad0f
31e0f17
d2137e3
4083407
 
d2137e3
31e0f17
 
 
 
 
 
 
 
1ee396e
 
 
abc6868
1ee396e
 
 
 
 
 
 
 
 
4dc7b7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import os

# Redirect cache to a writable path inside container.
# NOTE: this assignment is deliberately placed before the third-party imports
# below — presumably so that any library consulting XDG_CACHE_HOME while being
# imported or instantiated already sees the writable location (TODO confirm
# which dependency actually reads it).
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.ocrqa import OCRQAPipeline

# Single pipeline instance created once at import time and reused by every
# call to process_ocr_qa().
pipeline = OCRQAPipeline()

# Custom stylesheet handed to demo.launch(); the selector targets the textarea
# of the component whose elem_id is "ocr_output" (the result textbox).
CSS = """
#ocr_output textarea {
    font-family: monospace !important;
}
"""


# Language codes offered in the UI dropdown ("lb" is presumably Luxembourgish,
# which the help text mentions — verify against the pipeline's supported set).
LANGUAGES = ["en", "de", "fr","lb"]

# Example OCR text (German text with typical OCR errors); used as the initial
# value of the input textbox.
EXAMPLE_TEXT = """Vieles Seltsame geschieht auf Erden :
Nichts Seltsameres sieht der Mond
Als das Glück, das im Knopfloch wohnt.
Zaubrisch faßt es den ernsten Mann.
Ohne nach Weib u. Kinjd zu fragen
Reitet er aus, nach dem Glück zu jagen,
Nur nacb ihm war stets sein Vegehr.
Aber neben ihm 1reitet der Dämon her
Des Ehrgeizes mit finsterer Tücke,
Und so jagt er zuletzt auf die Brücke,
Die über dem Abgrund, d:m nächtlich schwarzen
Jählings abbricht."""

def _format_ocr_qa_result(result):
    """Render an OCR QA result dict as the multi-section text report.

    Sections are emitted in a fixed order (language, score, known tokens,
    unrecognized tokens, model id, any remaining keys) and joined with blank
    lines. Missing or ``None`` fields are skipped rather than raising.
    """
    # `or {}` / `or []` also cover keys that are present but set to None,
    # which a plain .get(key, default) would pass through unchanged.
    diagnostics = result.get("diagnostics") or {}
    known_tokens = diagnostics.get("known_tokens") or []
    unknown_tokens = diagnostics.get("unknown_tokens") or []

    # Counts are computed unconditionally: the token sections below need them
    # even when the result carries no score.
    known_count = len(known_tokens)
    unknown_count = len(unknown_tokens)
    total = known_count + unknown_count

    sections = []

    # Language detection
    if "language" in result:
        sections.append(f"🌍 Language: {result['language']}")

    # Quality score (with ratio of known to all unique tokens)
    score = result.get("score")
    if score is not None:
        ratio = f"{known_count}/{total}" if total > 0 else "n/a"
        score_emoji = "🟢" if score >= 0.8 else "🟡" if score >= 0.5 else "🔴"
        sections.append(
            f"{score_emoji} Quality Score: {score if total else 'n/a'}  ({ratio})"
        )

    # Known tokens
    if known_tokens:
        sections.append(
            f"✅ Known unique tokens ({known_count}):\n{' '.join(known_tokens)}"
        )

    # Unknown tokens (potential OCR errors)
    if unknown_tokens:
        sections.append(
            f"❌ Unrecognized unique tokens ({unknown_count}):\n{' '.join(unknown_tokens)}"
        )
    elif "unknown_tokens" in diagnostics and total > 0:
        # The pipeline reported an (empty) unknown list and there was at
        # least one token overall, so everything was recognized.
        sections.append("✨ All tokens were known – no OCR errors detected.")

    # Model information
    if "model_id" in diagnostics:
        sections.append(f"🤖 Model: {diagnostics['model_id']}")

    # Other fields: anything not already rendered above
    for key, value in result.items():
        if key not in ("language", "score", "diagnostics"):
            sections.append(f"🔍 {key.replace('_', ' ').title()}: {value}")

    return "\n\n".join(sections)


def process_ocr_qa(text, lang_choice):
    """Run the OCR QA pipeline on *text* and return a human-readable report.

    Args:
        text: OCR text to assess.
        lang_choice: Language code selected in the UI; the special value
            "Auto-detect" is passed to the pipeline as ``language=None``.

    Returns:
        A formatted report string. If the pipeline returns something other
        than a dict, its string form is returned under a generic heading.
        Any exception is reported as ``"Error: <message>"`` instead of being
        raised, so the UI always receives displayable text.
    """
    try:
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, language=lang, diagnostics=True)

        if isinstance(result, dict):
            return _format_ocr_qa_result(result)
        return f"✨ Processed Result:\n{result}"

    except Exception as e:
        # UI boundary: surface the failure to the user and the server log
        # rather than letting the request crash.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"

# Build the interface: intro text, a two-column input/output row, a hidden
# "about" accordion, and the button event wiring.
with gr.Blocks(title="OCR QA Demo") as demo:

    gr.Markdown(
        """
    # 🔍 Optical Character Recognition (OCR) Quality Assessment Demo

    This demo evaluates OCR quality by comparing the unique words in a text against large reference vocabularies.
    
    It reports:
    - **potential OCR errors** (unrecognized unique tokens) and known tokens
    - an overall **quality score** between 0.0 (poor) and 1.0 (perfect), defined as `score = known/(known + unrecognized)`


    Try the German example below or paste your own OCR text.
    """
    )
    
    with gr.Row():
        # Left column: inputs and action buttons
        with gr.Column():
            text_input = gr.Textbox(
                label="OCR input text", 
                value=EXAMPLE_TEXT,
                lines=8,
                placeholder="Paste OCR text..."
            )
            lang_dropdown = gr.Dropdown(
                choices=LANGUAGES, 
                value="de", 
                label="Language of the input text"
            )
            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
            info_btn = gr.Button("Help", size="md", scale=1)
        
        # Right column: assessment report and project logo
        with gr.Column():
            output = gr.Textbox(
                label="OCR Quality Assessment", 
                lines=15,
                placeholder="The quality assessment will be shown here...",
                scale=10,
                # Hook for the monospace rule in the CSS constant
                elem_id="ocr_output"
                
            )
            gr.HTML(
                """
                <a href="https://impresso-project.ch" target="_blank">
                    <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg" 
                         alt="Impresso Project Logo" 
                         style="height: 42px; display: block; margin: 5px auto; background-color: white;">
                </a>
                """
            )
    
    # Accordion with pipeline details; hidden until the Help button is clicked
    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
        gr.Markdown(
    """
    This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.

    #### How it works:
    - **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
    - **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
    - **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
    - **Diagnostics output**:
        - ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
        - ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
        - Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.

    #### ⚠️ Limitations:
    - The lists of known words are **not exhaustive**, particularly for **historical vocabulary**, **Luxembourgish**, or **named entities**.
    - The method may fail to flag **short OCR artifacts** (e.g., 1–3 character noise).

    As such, the score should be understood as a **heuristic indicator**, best used for:
    - Comparative assessments between OCR outputs
    - Filtering low-quality text from large corpora
    - Supporting decisions in corpus preparation and annotation workflows

    It is **not a substitute for manual inspection** or ground-truth evaluation.
    """
        )
    
    # Run the assessment on the current text and language selection
    submit_btn.click(
        fn=process_ocr_qa,
        inputs=[text_input, lang_dropdown],
        outputs=output
    )
    
    # Reveal and expand the info accordion when the Help button is clicked.
    # This is one-way: the handler always returns visible=True/open=True, so
    # further clicks do not hide it again.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion
    )

# Start the app: listen on all network interfaces (0.0.0.0) at port 7860 and
# pass the custom stylesheet held in CSS.
demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS)