Spaces:

Sawon2023
/

llm-pdf-qa

Runtime error

App Files Files Community

Sawon2023 commited on Sep 19, 2023

Commit

68c32d7

1 Parent(s): c78d4ac

Q&A Generator from PDF (Text not Image)

Browse files

Files changed (4) hide show

.env +1 -0
README.md +28 -13
app.py +37 -0
pdftoqa_generator.py +72 -0

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ OPENAI_API_KEY="<REDACTED - never commit real keys; revoke this one and supply it via a Space secret or an untracked .env>"

README.md CHANGED Viewed

@@ -1,13 +1,28 @@
----
-title: Llm Pdf Qa
-emoji: 📉
-colorFrom: green
-colorTo: blue
-sdk: gradio
-sdk_version: 3.44.4
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+## Make Question and Answer from your PDF
+### Setup Environment:
+1.  Create an account at https://openai.com/ and generate your own API key.
+2.  Install the following libraries and packages (drop the leading `!` when running outside a notebook):
+    a.  !pip install langchain
+    b.  !pip install pypdf
+    c.  !pip install transformers==4.33.1
+        This particular package will install the following dependencies:
+        1. huggingface-hub-0.17.1
+        2. safetensors-0.3.3
+        3. tokenizers-0.13.3
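+    d. !pip install gradio
+    e. !pip install pandas tqdm torch
+        (`pandas` and `tqdm` are imported by the code; `torch` is the backend used by the question-generation model)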
+### Run the System
+1.  Run the file:
+```
+python3 app.py
+```
+2. Copy the URL printed in the terminal and paste it into your browser
+3. Upload your PDF and get the questions and answers generated from its text

app.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import json
+import os
+import re
+import statistics
+import gradio as gr
+import pandas as pd
+from pdftoqa_generator import *
+def predict(file):
+    resource = pdf_parser(file)
+    qa_notes = qa_generator(resource)
+    return qa_notes
+description = """Do you have a long document and a bunch of questions that can be answered given the data in this file?
+Fear not for this demo is for you.
+Upload your pdf, ask your questions and wait for the magic to happen.
+DISCLAIMER: I do no have idea what happens to the pdfs that you upload and who has access to them so make sure there is nothing confidential there.
+"""
+title = "QA answering from a pdf."
+iface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.inputs.File(),
+    ],
+    outputs="text",
+    description=description,
+    title=title,
+    allow_screenshot=True,
+)
+iface.launch(enable_queue=True, show_error=True)

pdftoqa_generator.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import json
+import os
+import re
+import statistics
+import gradio as gr
+import pandas as pd
+from langchain.document_loaders import PyPDFLoader
+from langchain.text_splitter import (
+    CharacterTextSplitter,
+    RecursiveCharacterTextSplitter,
+)
+from tqdm import tqdm
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+os.environ["OPENAI_API_KEY"] = "sk-"
+def pdf_parser(file_path):
+    pdf_loader = PyPDFLoader(file_path)
+    documents = pdf_loader.load()
+    documents_text = [d.page_content for d in documents]
+    text_splitter = RecursiveCharacterTextSplitter(
+        # Set a really small chunk size, just to show.
+        chunk_size=600,
+        chunk_overlap=200,
+        length_function=len,
+        is_separator_regex=False,
+    )
+    # Split the text into chunks
+    texts = text_splitter.create_documents(documents_text)
+    return texts
+def qa_generator(texts):
+    question_tokenizer = AutoTokenizer.from_pretrained(
+        "potsawee/t5-large-generation-squad-QuestionAnswer"
+    )
+    question_model = AutoModelForSeq2SeqLM.from_pretrained(
+        "potsawee/t5-large-generation-squad-QuestionAnswer"
+    )
+    question_answer_dic = {}
+    for i in tqdm(texts):
+        context = i.page_content
+        try:
+            inputs = question_tokenizer(context, return_tensors="pt")
+            outputs = question_model.generate(**inputs, max_length=100)
+            question_answer = question_tokenizer.decode(
+                outputs[0], skip_special_tokens=False
+            )
+            question_answer = question_answer.replace(
+                question_tokenizer.pad_token, ""
+            ).replace(question_tokenizer.eos_token, "")
+            question, answer = question_answer.split(question_tokenizer.sep_token)
+            question_answer_dic[question] = answer
+        except:
+            print(i)
+    qa_notes_df = pd.DataFrame(data=[], columns=["No", "Question", "Answer"])
+    qa_notes_df["No"] = [i + 1 for i in range(0, len(question_answer_dic))]
+    qa_notes_df["Question"] = [k for k in question_answer_dic.keys()]
+    qa_notes_df["Answer"] = [a for a in question_answer_dic.values()]
+    qa_notes_json = qa_notes_df.to_dict("records")
+    return qa_notes_json