Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ import docx2txt
|
|
| 16 |
from io import StringIO
|
| 17 |
from PyPDF2 import PdfFileReader
|
| 18 |
import warnings
|
|
|
|
| 19 |
warnings.filterwarnings("ignore")
|
| 20 |
|
| 21 |
|
|
@@ -63,6 +64,28 @@ def article_text_extractor(url: str):
|
|
| 63 |
chunks[chunk_id] = " ".join(chunks[chunk_id])
|
| 64 |
|
| 65 |
return article_header, chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
def preprocess_plain_text(x):
|
| 68 |
|
|
@@ -85,6 +108,7 @@ def extract_pdf(file):
|
|
| 85 |
for i in range(count):
|
| 86 |
page = pdfReader.getPage(i)
|
| 87 |
all_text += page.extractText()
|
|
|
|
| 88 |
|
| 89 |
return all_text
|
| 90 |
|
|
@@ -199,11 +223,11 @@ if is_url:
|
|
| 199 |
|
| 200 |
elif upload_doc:
|
| 201 |
|
| 202 |
-
clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
|
| 203 |
|
| 204 |
else:
|
| 205 |
|
| 206 |
-
clean_text = preprocess_plain_text(plain_text)
|
| 207 |
|
| 208 |
summarize = st.button("Summarize")
|
| 209 |
|
|
|
|
| 16 |
from io import StringIO
|
| 17 |
from PyPDF2 import PdfFileReader
|
| 18 |
import warnings
|
| 19 |
+
from nltk import sent_tokenize
|
| 20 |
warnings.filterwarnings("ignore")
|
| 21 |
|
| 22 |
|
|
|
|
| 64 |
chunks[chunk_id] = " ".join(chunks[chunk_id])
|
| 65 |
|
| 66 |
return article_header, chunks
|
| 67 |
+
|
| 68 |
+
def chunk_clean_text(text, max_words=500):
    """Split *text* into word-count-bounded chunks of whole sentences.

    Sentences (from ``nltk.sent_tokenize``) are never split across chunks:
    a chunk grows sentence by sentence until adding the next sentence would
    push it past *max_words* whitespace-separated words, then a new chunk
    starts.  A single sentence longer than *max_words* therefore becomes
    its own oversized chunk.

    Parameters
    ----------
    text : str
        Plain text to split.  Empty input yields an empty list.
    max_words : int, optional
        Soft per-chunk word budget (default 500 — presumably sized to the
        downstream summarization model's input limit; confirm with caller).

    Returns
    -------
    list[str]
        Chunks, each re-joined into a single space-separated string.
    """
    sentences = sent_tokenize(text)
    current_chunk = 0
    chunks = []

    for sentence in sentences:
        words = sentence.split(" ")
        if len(chunks) == current_chunk + 1:
            # An open chunk exists: extend it while it still fits the
            # budget, otherwise start a fresh chunk with this sentence.
            if len(chunks[current_chunk]) + len(words) <= max_words:
                chunks[current_chunk].extend(words)
            else:
                current_chunk += 1
                chunks.append(words)
        else:
            # No chunk open yet (first sentence): start the first chunk.
            # (Removed leftover debug print(current_chunk) from here.)
            chunks.append(words)

    # Collapse each chunk's word list back into one string.
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return chunks
|
| 89 |
|
| 90 |
def preprocess_plain_text(x):
|
| 91 |
|
|
|
|
| 108 |
for i in range(count):
|
| 109 |
page = pdfReader.getPage(i)
|
| 110 |
all_text += page.extractText()
|
| 111 |
+
|
| 112 |
|
| 113 |
return all_text
|
| 114 |
|
|
|
|
| 223 |
|
| 224 |
elif upload_doc:
|
| 225 |
|
| 226 |
+
clean_text = chunk_clean_text(preprocess_plain_text(extract_text_from_file(upload_doc)))
|
| 227 |
|
| 228 |
else:
|
| 229 |
|
| 230 |
+
clean_text = chunk_clean_text(preprocess_plain_text(plain_text))
|
| 231 |
|
| 232 |
summarize = st.button("Summarize")
|
| 233 |
|