Spaces:
Runtime error
Runtime error
Updated the file reading for the app
Browse files
Error Fix: https://stackoverflow.com/questions/51337167/typeerror-stat-path-should-be-string-bytes-os-pathlike-or-integer-not-io-t
- pdftoqa_generator.py +22 -16
pdftoqa_generator.py
CHANGED
|
@@ -11,29 +11,35 @@ from langchain.text_splitter import (
|
|
| 11 |
RecursiveCharacterTextSplitter,
|
| 12 |
)
|
| 13 |
from tqdm import tqdm
|
|
|
|
| 14 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 15 |
|
| 16 |
os.environ["OPENAI_API_KEY"] = "sk-"
|
| 17 |
|
| 18 |
|
| 19 |
def pdf_parser(file_path):
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def qa_generator(texts):
|
|
|
|
| 11 |
RecursiveCharacterTextSplitter,
|
| 12 |
)
|
| 13 |
from tqdm import tqdm
|
| 14 |
+
from tempfile import NamedTemporaryFile
|
| 15 |
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
| 16 |
|
| 17 |
# Sets OPENAI_API_KEY for this process at import time, unconditionally
# overwriting any value already present in the environment.
# NOTE(review): "sk-" looks like a placeholder rather than a usable key --
# confirm the real key is injected elsewhere (e.g. a deployment secret) and
# never committed here.
os.environ["OPENAI_API_KEY"] = "sk-"
|
| 18 |
|
| 19 |
|
| 20 |
def pdf_parser(file_path):
    """Load a PDF and split its page text into overlapping chunks.

    Args:
        file_path: Either a filesystem path to a PDF (``str`` or
            ``os.PathLike``) or a binary file-like object exposing
            ``read()``, such as an uploaded file handed over by a web UI.

    Returns:
        The list of documents produced by
        ``RecursiveCharacterTextSplitter.create_documents`` from the text of
        every page in the PDF.
    """
    if hasattr(file_path, "read"):
        # PyPDFLoader is given a filesystem path, so spill the in-memory
        # upload to a named temporary file first. delete=False keeps the file
        # on disk after the handle is closed; closing it before loading
        # guarantees the bytes are flushed.
        with NamedTemporaryFile(delete=False) as tmp:
            tmp.write(file_path.read())
        try:
            documents = PyPDFLoader(tmp.name).load()
        finally:
            # Always clean up the temp file, even when parsing fails.
            os.remove(tmp.name)
    else:
        # A plain path can be handed to the loader directly.
        documents = PyPDFLoader(file_path).load()

    documents_text = [d.page_content for d in documents]

    # 600-character chunks with a 200-character overlap between neighbours.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )

    # Split the page texts into chunks.
    return text_splitter.create_documents(documents_text)
|
| 43 |
|
| 44 |
|
| 45 |
def qa_generator(texts):
|