Spaces:
Running
Running
File size: 7,877 Bytes
1ee396e f2d9b6e 56943b3 abc6868 1ee396e 4784f3b 1ee396e 4784f3b 1ee396e abc6868 1ee396e abc6868 1f2982b 1ee396e 1f2982b a4274f6 1f2982b 97e3d11 5fc20da 97e3d11 1f2982b 97e3d11 5fc20da 97e3d11 1f2982b 595c35f 1f2982b 8d5fa5e 97e3d11 1f2982b 8d5fa5e ffdaf59 41fcaae 8751493 595c35f 1f2982b 1ee396e 1f2982b a4274f6 1ee396e 08efe9a 1ee396e a4274f6 1ee396e 4dc7b7d 1b679c3 1ee396e 614dfc7 9b52dc7 1b679c3 fb6f7ac 1b679c3 9b52dc7 f2d9b6e fb6f7ac 9b52dc7 1ee396e f2d9b6e 1ee396e 8751493 1ee396e abc6868 31e0f17 abc6868 8751493 abc6868 31e0f17 7da3ef8 1ee396e 378b616 1ee396e 31e0f17 1ee396e d2137e3 31e0f17 d2137e3 31e0f17 6acad0f 31e0f17 d2137e3 4083407 d2137e3 31e0f17 1ee396e abc6868 1ee396e 4dc7b7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 |
import os
# Redirect cache to a writable path inside container
# NOTE(review): this assignment sits between the imports, presumably so that
# gradio / impresso_pipelines already see XDG_CACHE_HOME while they are being
# imported -- keep it ahead of them unless that is confirmed to be unnecessary.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"
import gradio as gr
from impresso_pipelines.ocrqa import OCRQAPipeline
# One pipeline instance, created at import time and called by process_ocr_qa
# for every submitted text.
pipeline = OCRQAPipeline()
# Custom stylesheet: render the assessment output box in a monospace font.
CSS = """
#ocr_output textarea {
font-family: monospace !important;
}
"""

# Language codes offered in the language dropdown.
LANGUAGES = [
    "en",
    "de",
    "fr",
    "lb",
]

# Example OCR text (German text with typical OCR errors).
# One entry per verse; the OCR mistakes (e.g. "Kinjd", "nacb", "1reitet",
# "d:m") are intentional and must stay as they are.
_EXAMPLE_VERSES = (
    "Vieles Seltsame geschieht auf Erden :",
    "Nichts Seltsameres sieht der Mond",
    "Als das Glück, das im Knopfloch wohnt.",
    "Zaubrisch faßt es den ernsten Mann.",
    "Ohne nach Weib u. Kinjd zu fragen",
    "Reitet er aus, nach dem Glück zu jagen,",
    "Nur nacb ihm war stets sein Vegehr.",
    "Aber neben ihm 1reitet der Dämon her",
    "Des Ehrgeizes mit finsterer Tücke,",
    "Und so jagt er zuletzt auf die Brücke,",
    "Die über dem Abgrund, d:m nächtlich schwarzen",
    "Jählings abbricht.",
)
EXAMPLE_TEXT = "\n".join(_EXAMPLE_VERSES)
def _format_ocr_qa_result(result):
    """Render an OCR QA result dict as the text shown in the output box.

    Args:
        result: Dict returned by the pipeline. The keys read here are
            ``language``, ``score`` and ``diagnostics`` (with
            ``known_tokens``, ``unknown_tokens`` and ``model_id``); every
            other key is listed generically at the end.

    Returns:
        The report sections joined by blank lines.
    """
    sections = []

    # Language detection
    if "language" in result:
        sections.append(f"🌍 Language: {result['language']}")

    # `or {}` / `or []` also cover keys that are present but set to None.
    diagnostics = result.get("diagnostics") or {}
    known_tokens = diagnostics.get("known_tokens") or []
    unknown_tokens = diagnostics.get("unknown_tokens") or []

    # Counted unconditionally: the token sections below need these numbers
    # even when the result carries no score.
    known_count = len(known_tokens)
    unknown_count = len(unknown_tokens)
    total = known_count + unknown_count

    # Quality score (with ratio)
    score = result.get("score")
    if score is not None:
        ratio = f"{known_count}/{total}" if total > 0 else "n/a"
        score_emoji = "🟢" if score >= 0.8 else "🟡" if score >= 0.5 else "🔴"
        sections.append(
            f"{score_emoji} Quality Score: {score if total else 'n/a'} ({ratio})"
        )

    # Diagnostics section
    if diagnostics:
        # Known tokens
        if known_tokens:
            sections.append(
                f"✅ Known unique tokens ({known_count}):\n{' '.join(known_tokens)}"
            )

        # Unknown tokens (potential OCR errors)
        if unknown_tokens:
            sections.append(
                f"❌ Unrecognized unique tokens ({unknown_count}):\n{' '.join(unknown_tokens)}"
            )
        elif "unknown_tokens" in diagnostics and total > 0:
            sections.append("✨ All tokens were known – no OCR errors detected.")

        # Model information
        if "model_id" in diagnostics:
            sections.append(f"🤖 Model: {diagnostics['model_id']}")

    # Other fields
    for key, value in result.items():
        if key not in ("language", "score", "diagnostics"):
            sections.append(f"🔍 {key.replace('_', ' ').title()}: {value}")

    return "\n\n".join(sections)


def process_ocr_qa(text, lang_choice, *, ocrqa_pipeline=None):
    """Run the OCR QA pipeline on ``text`` and return a readable report.

    Args:
        text: OCR text to assess.
        lang_choice: Language code handed to the pipeline. The special value
            ``"Auto-detect"`` is translated to ``language=None``.
        ocrqa_pipeline: Optional callable used instead of the module-level
            ``pipeline``. It is called as
            ``ocrqa_pipeline(text, language=..., diagnostics=True)``.

    Returns:
        The formatted report for dict results, ``"✨ Processed Result:\\n..."``
        for any other result type, or ``"Error: <message>"`` if the pipeline
        or the formatting raised.
    """
    try:
        # Resolved per call so the module-level pipeline remains the default.
        run = pipeline if ocrqa_pipeline is None else ocrqa_pipeline
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = run(text, language=lang, diagnostics=True)

        # Format the output for better readability
        if isinstance(result, dict):
            return _format_ocr_qa_result(result)
        return f"✨ Processed Result:\n{result}"
    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # letting the exception propagate.
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
# Create the interface with logo and improved description
with gr.Blocks(title="OCR QA Demo") as demo:
    gr.Markdown(
        """
        # 🔍 Optical Character Recognition (OCR) Quality Assessment Demo

        This demo evaluates OCR quality by comparing the unique words in a text against large reference vocabularies.

        It reports:

        - **potential OCR errors** (unrecognized unique tokens) and known tokens
        - an overall **quality score** between 0.0 (poor) and 1.0 (perfect), defined as `score = known/(known + unrecognized)`

        Try the German example below or paste your own OCR text.
        """
    )

    with gr.Row():
        # Left column: input text, language selection and action buttons.
        with gr.Column():
            text_input = gr.Textbox(
                label="OCR input text",
                value=EXAMPLE_TEXT,
                lines=8,
                placeholder="Paste OCR text...",
            )
            # NOTE(review): process_ocr_qa maps the choice "Auto-detect" to
            # language=None, but that choice is not offered here -- confirm
            # whether it was left out on purpose.
            lang_dropdown = gr.Dropdown(
                choices=LANGUAGES,
                value="de",
                label="Language of the input text",
            )
            submit_btn = gr.Button("🔍 Assess OCR Text Quality", variant="primary")
            info_btn = gr.Button("Help", size="md", scale=1)

        # Right column: assessment output and project logo.
        with gr.Column():
            output = gr.Textbox(
                label="OCR Quality Assessment",
                lines=15,
                placeholder="The quality assessment will be shown here...",
                scale=10,
                elem_id="ocr_output",  # targeted by the #ocr_output rule in CSS
            )
            gr.HTML(
                """
                <a href="https://impresso-project.ch" target="_blank">
                <img src="https://huggingface.co/spaces/impresso-project/ocrqa-demo/resolve/main/logo.jpeg"
                alt="Impresso Project Logo"
                style="height: 42px; display: block; margin: 5px auto; background-color: white;">
                </a>
                """
            )

    # Info accordion with pipeline details; hidden until "Help" is clicked.
    with gr.Accordion("📝 About the OCR QA Method", open=False, visible=False) as info_accordion:
        gr.Markdown(
            """
            This pipeline estimates OCR quality by analyzing the proportion of **unique words** in a text that match curated wordlists for a given language.

            #### How it works:

            - **Scoring**: The quality score ranges from **0.0** (poor) to **1.0** (excellent) and is based on the ratio of recognized to unrecognized unique word forms.
            - **Lexical resources**: Words are matched against precompiled lists derived from **Wikipedia** and **Wortschatz Leipzig**, using **Bloom filters** for fast, memory-efficient lookup.
            - **Multilingual support**: Available for several languages (e.g., German, French, English). If not specified, the language is detected automatically.
            - **Diagnostics output**:
              - ✅ **Known tokens**: Words found in the reference wordlist, presumed correctly OCR’d.
              - ❌ **Unrecognized tokens**: Words not found in the list—often OCR errors, rare forms, or out-of-vocabulary items (e.g., names, historical terms).
            - Note: Non-alphabetic characters will be removed. For efficiency reasons, all digits are replaced by the digit 0.

            #### ⚠️ Limitations:

            - The lists of known words are **not exhaustive**, particularly for **historical vocabulary**, **Luxembourgish**, or **named entities**.
            - The method may fail to flag **short OCR artifacts** (e.g., 1–3 character noise).

            As such, the score should be understood as a **heuristic indicator**, best used for:

            - Comparative assessments between OCR outputs
            - Filtering low-quality text from large corpora
            - Supporting decisions in corpus preparation and annotation workflows

            It is **not a substitute for manual inspection** or ground-truth evaluation.
            """
        )

    submit_btn.click(
        fn=process_ocr_qa,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )

    # Reveal and expand the info accordion when the Help button is clicked.
    # This only ever shows it; a second click does not hide it again.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion,
    )

# NOTE(review): passing css to launch() assumes a Gradio release whose
# launch() accepts it (older releases take css on gr.Blocks) -- confirm
# against the Gradio version pinned for this Space.
demo.launch(server_name="0.0.0.0", server_port=7860, css=CSS)