import os
import re

import fitz
import gradio as gr
import requests
def extract_text_from_pdf(pdf_file_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_file_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The text of all pages, in page order, joined with no separator.
    """
    # Opening the document as a context manager guarantees it is closed
    # (releasing the underlying file handle) even if extraction fails midway.
    with fitz.open(pdf_file_path) as doc:
        # join() avoids rebuilding the growing string once per page.
        return "".join(page.get_text() for page in doc)
# Hugging Face Inference API endpoint of the question/answer generation model.
# (The previous value was an HTML-scrape artifact: a percent-encoded proxy URL
# with trailing markup and no closing quote.)
API_URL = "https://api-inference.huggingface.co/models/potsawee/t5-large-generation-squad-QuestionAnswer"

# NOTE(security): an API token is committed in this file. Set HF_API_TOKEN in
# the environment to override it; the embedded fallback is kept only so the
# default behaviour is unchanged and should be revoked/rotated and removed.
headers = {
    "Authorization": "Bearer "
    + os.environ.get("HF_API_TOKEN", "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")
}
def query(payload, timeout=60):
    """POST *payload* as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON response. If the request fails (connection error,
        timeout) or the body is not valid JSON, a dict of the form
        ``{"error": "<message>"}`` is returned instead of raising, so one
        failed call does not abort a whole batch of requests.
    """
    try:
        response = requests.post(
            API_URL, headers=headers, json=payload, timeout=timeout
        )
        # ValueError covers JSON decode failures on older ``requests``
        # versions, where the decode error is not a RequestException.
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        return {"error": str(exc)}
# Captures everything up to and including the first '?' on the first line
# (the question) and the remainder of that line (the answer).
_QA_PAIR_RE = re.compile(r'(.*?\?)(.*)')


def _format_qa_pair(generated_text):
    """Format one generated string as a Question/Answer entry, or None if it has no '?' on its first line."""
    match = _QA_PAIR_RE.match(generated_text)
    if match is None:
        return None
    question_part = match.group(1).strip()
    answer_part = match.group(2).strip()
    return f"Question: {question_part}\nAnswer: {answer_part}\n\n"


def generate_question_answer_pairs(input_file):
    """Generate question/answer pairs for each sentence of an uploaded PDF.

    The PDF text is split after every ``.``, ``!`` or ``?``; each non-blank
    piece is sent to ``query`` and the generated text is split at its first
    ``?`` into a question and an answer.

    Args:
        input_file: The uploaded PDF as accepted by ``extract_text_from_pdf``,
            or ``None`` when nothing was uploaded.

    Returns:
        A string of ``Question: ...`` / ``Answer: ...`` entries separated by
        blank lines (empty if nothing usable was generated), or the message
        ``"Please upload a file"`` when ``input_file`` is ``None``.
    """
    if input_file is None:
        return "Please upload a file"

    pdf_text = extract_text_from_pdf(input_file)
    # Zero-width split: each piece keeps its terminating punctuation mark.
    sentences = re.split(r'(?<=[.!?])', pdf_text)

    entries = []
    for sentence in sentences:
        if not sentence.strip():
            continue
        output = query({"inputs": sentence})

        # Usable replies are lists whose first item holds 'generated_text'.
        # Dict replies (presumably API error payloads) and any other
        # unexpected shape are skipped rather than raising on indexing.
        if not isinstance(output, list) or not output:
            continue
        first = output[0]
        if not isinstance(first, dict):
            continue
        pair = first.get('generated_text')
        if not isinstance(pair, str):
            continue

        entry = _format_qa_pair(pair)
        if entry is not None:
            entries.append(entry)

    # Single join instead of repeated string concatenation.
    return "".join(entries)
# --- Gradio user interface -------------------------------------------------
# One file-upload input feeds generate_question_answer_pairs; the returned
# text is shown in a single textbox.
title = "Question-Answer Pairs Generation"

input_file = gr.File(label="Upload a PDF file")
output_text = gr.Textbox()

interface = gr.Interface(
    title=title,
    fn=generate_question_answer_pairs,
    inputs=input_file,
    outputs=output_text,
)

# Start the web app as soon as the script is executed.
interface.launch()