Spaces:

ugaray96
/

neural-search

Runtime error

ugmSorcero committed on Sep 14, 2022

Commit

46323da

1 Parent(s): 753ae25

Adds image-to-text and Tesseract Linux dependencies

Files changed (5) hide show

interface/components.py CHANGED Viewed

@@ -80,6 +80,11 @@ def component_article_url(container):
                     st.markdown("---")
                 else:
                     break
         corpus = [
             {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
         ]
@@ -93,7 +98,7 @@ def component_file_input(container):
         doc_id = 1
         with st.expander("Enter Files"):
             while True:
-                file = st.file_uploader("Upload a .txt, .pdf, .csv file", key=doc_id)
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
@@ -104,6 +109,11 @@ def component_file_input(container):
                         break
                 else:
                     break
         corpus = [
             {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
         ]

                     st.markdown("---")
                 else:
                     break
+        for idx, doc in enumerate(urls):
+            with st.expander(f"Preview URL {idx}"):
+                st.write(doc)
         corpus = [
             {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(urls)
         ]
         doc_id = 1
         with st.expander("Enter Files"):
             while True:
+                file = st.file_uploader("Upload a .txt, .pdf, .csv, image file", key=doc_id)
                 if file != None:
                     extracted_text = extract_text_from_file(file)
                     if extracted_text != None:
                         break
                 else:
                     break
+        for idx, doc in enumerate(files):
+            with st.expander(f"Preview File {idx}"):
+                st.write(doc)
         corpus = [
             {"text": doc["text"], "id": doc_id} for doc_id, doc in enumerate(files)
         ]

interface/pages.py CHANGED Viewed

@@ -26,7 +26,6 @@ def page_landing_page(container):
         st.markdown(
             "TODO list:"
             "\n  - Build other pipelines"
-            "\n  - Include file/url indexing"
             "\n  - [Optional] Include text to audio to read responses"
         )

         st.markdown(
             "TODO list:"
             "\n  - Build other pipelines"
             "\n  - [Optional] Include text to audio to read responses"
         )

interface/utils.py CHANGED Viewed

@@ -5,7 +5,8 @@ from newspaper import Article
 from PyPDF2 import PdfFileReader
 import streamlit as st
 import pandas as pd
 def get_pipelines():
     pipeline_names, pipeline_funcs = list(
@@ -25,7 +26,7 @@ def extract_text_from_url(url: str):
     return article.text
 def extract_text_from_file(file):
     # read text file
     if file.type == "text/plain":
@@ -76,6 +77,10 @@ def extract_text_from_file(file):
                     continue
                 file_text += " " + txt
         return file_text
     else:
         st.warning(f"File type {file.type} not supported")

 from PyPDF2 import PdfFileReader
 import streamlit as st
 import pandas as pd
+import pytesseract
+from PIL import Image
 def get_pipelines():
     pipeline_names, pipeline_funcs = list(
     return article.text
+@st.experimental_memo
 def extract_text_from_file(file):
     # read text file
     if file.type == "text/plain":
                     continue
                 file_text += " " + txt
         return file_text
+    # read image file (OCR)
+    elif file.type == 'image/jpeg':
+        return pytesseract.image_to_string(Image.open(file))
     else:
         st.warning(f"File type {file.type} not supported")

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ tesseract-ocr-all

requirements.txt CHANGED Viewed

@@ -4,4 +4,5 @@ farm-haystack==1.8.0
 black==22.8.0
 plotly==5.10.0
 newspaper3k==0.2.8
-PyPDF2==2.10.7

 black==22.8.0
 plotly==5.10.0
 newspaper3k==0.2.8
+PyPDF2==2.10.7
+pytesseract==0.3.10