Spaces:
Runtime error
Runtime error
ugmSorcero
committed on
Commit
·
46323da
1
Parent(s):
753ae25
Adds image to text and tesseract linux dependencies
Browse files
- interface/components.py +11 -1
- interface/pages.py +0 -1
- interface/utils.py +7 -2
- packages.txt +1 -0
- requirements.txt +2 -1
interface/components.py
CHANGED
|
@@ -80,6 +80,11 @@ def component_article_url(container):
|
|
| 80 |
st.markdown("---")
|
| 81 |
else:
|
| 82 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
corpus = [
|
| 84 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
| 85 |
]
|
|
@@ -93,7 +98,7 @@ def component_file_input(container):
|
|
| 93 |
doc_id = 1
|
| 94 |
with st.expander("Enter Files"):
|
| 95 |
while True:
|
| 96 |
-
file = st.file_uploader("Upload a .txt, .pdf, .csv file", key=doc_id)
|
| 97 |
if file != None:
|
| 98 |
extracted_text = extract_text_from_file(file)
|
| 99 |
if extracted_text != None:
|
|
@@ -104,6 +109,11 @@ def component_file_input(container):
|
|
| 104 |
break
|
| 105 |
else:
|
| 106 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
corpus = [
|
| 108 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
|
| 109 |
]
|
|
|
|
| 80 |
st.markdown("---")
|
| 81 |
else:
|
| 82 |
break
|
| 83 |
+
|
| 84 |
+
for idx, doc in enumerate(urls):
|
| 85 |
+
with st.expander(f"Preview URL {idx}"):
|
| 86 |
+
st.write(doc)
|
| 87 |
+
|
| 88 |
corpus = [
|
| 89 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
|
| 90 |
]
|
|
|
|
| 98 |
doc_id = 1
|
| 99 |
with st.expander("Enter Files"):
|
| 100 |
while True:
|
| 101 |
+
file = st.file_uploader("Upload a .txt, .pdf, .csv, image file", key=doc_id)
|
| 102 |
if file != None:
|
| 103 |
extracted_text = extract_text_from_file(file)
|
| 104 |
if extracted_text != None:
|
|
|
|
| 109 |
break
|
| 110 |
else:
|
| 111 |
break
|
| 112 |
+
|
| 113 |
+
for idx, doc in enumerate(files):
|
| 114 |
+
with st.expander(f"Preview File {idx}"):
|
| 115 |
+
st.write(doc)
|
| 116 |
+
|
| 117 |
corpus = [
|
| 118 |
{"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
|
| 119 |
]
|
interface/pages.py
CHANGED
|
@@ -26,7 +26,6 @@ def page_landing_page(container):
|
|
| 26 |
st.markdown(
|
| 27 |
"TODO list:"
|
| 28 |
"\n - Build other pipelines"
|
| 29 |
-
"\n - Include file/url indexing"
|
| 30 |
"\n - [Optional] Include text to audio to read responses"
|
| 31 |
)
|
| 32 |
|
|
|
|
| 26 |
st.markdown(
|
| 27 |
"TODO list:"
|
| 28 |
"\n - Build other pipelines"
|
|
|
|
| 29 |
"\n - [Optional] Include text to audio to read responses"
|
| 30 |
)
|
| 31 |
|
interface/utils.py
CHANGED
|
@@ -5,7 +5,8 @@ from newspaper import Article
|
|
| 5 |
from PyPDF2 import PdfFileReader
|
| 6 |
import streamlit as st
|
| 7 |
import pandas as pd
|
| 8 |
-
|
|
|
|
| 9 |
|
| 10 |
def get_pipelines():
|
| 11 |
pipeline_names, pipeline_funcs = list(
|
|
@@ -25,7 +26,7 @@ def extract_text_from_url(url: str):
|
|
| 25 |
|
| 26 |
return article.text
|
| 27 |
|
| 28 |
-
|
| 29 |
def extract_text_from_file(file):
|
| 30 |
# read text file
|
| 31 |
if file.type == "text/plain":
|
|
@@ -76,6 +77,10 @@ def extract_text_from_file(file):
|
|
| 76 |
continue
|
| 77 |
file_text += " " + txt
|
| 78 |
return file_text
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
else:
|
| 81 |
st.warning(f"File type {file.type} not supported")
|
|
|
|
| 5 |
from PyPDF2 import PdfFileReader
|
| 6 |
import streamlit as st
|
| 7 |
import pandas as pd
|
| 8 |
+
import pytesseract
|
| 9 |
+
from PIL import Image
|
| 10 |
|
| 11 |
def get_pipelines():
|
| 12 |
pipeline_names, pipeline_funcs = list(
|
|
|
|
| 26 |
|
| 27 |
return article.text
|
| 28 |
|
| 29 |
+
@st.experimental_memo
|
| 30 |
def extract_text_from_file(file):
|
| 31 |
# read text file
|
| 32 |
if file.type == "text/plain":
|
|
|
|
| 77 |
continue
|
| 78 |
file_text += " " + txt
|
| 79 |
return file_text
|
| 80 |
+
|
| 81 |
+
# read image file (OCR)
|
| 82 |
+
elif file.type == 'image/jpeg':
|
| 83 |
+
return pytesseract.image_to_string(Image.open(file))
|
| 84 |
|
| 85 |
else:
|
| 86 |
st.warning(f"File type {file.type} not supported")
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
tesseract-ocr-all
|
requirements.txt
CHANGED
|
@@ -4,4 +4,5 @@ farm-haystack==1.8.0
|
|
| 4 |
black==22.8.0
|
| 5 |
plotly==5.10.0
|
| 6 |
newspaper3k==0.2.8
|
| 7 |
-
PyPDF2==2.10.7
|
|
|
|
|
|
| 4 |
black==22.8.0
|
| 5 |
plotly==5.10.0
|
| 6 |
newspaper3k==0.2.8
|
| 7 |
+
PyPDF2==2.10.7
|
| 8 |
+
pytesseract==0.3.10
|