Update app.py
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ import docx2txt
|
|
| 16 |
from io import StringIO
|
| 17 |
from PyPDF2 import PdfFileReader
|
| 18 |
import warnings
|
|
|
|
| 19 |
warnings.filterwarnings("ignore")
|
| 20 |
|
| 21 |
|
|
@@ -63,6 +64,28 @@ def article_text_extractor(url: str):
|
|
| 63 |
chunks[chunk_id] = " ".join(chunks[chunk_id])
|
| 64 |
|
| 65 |
return article_header, chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
def preprocess_plain_text(x):
|
| 68 |
|
|
@@ -85,6 +108,7 @@ def extract_pdf(file):
|
|
| 85 |
for i in range(count):
|
| 86 |
page = pdfReader.getPage(i)
|
| 87 |
all_text += page.extractText()
|
|
|
|
| 88 |
|
| 89 |
return all_text
|
| 90 |
|
|
@@ -199,11 +223,11 @@ if is_url:
|
|
| 199 |
|
| 200 |
elif upload_doc:
|
| 201 |
|
| 202 |
-
clean_text = preprocess_plain_text(extract_text_from_file(upload_doc))
|
| 203 |
|
| 204 |
else:
|
| 205 |
|
| 206 |
-
clean_text = preprocess_plain_text(plain_text)
|
| 207 |
|
| 208 |
summarize = st.button("Summarize")
|
| 209 |
|
|
|
|
| 16 |
from io import StringIO
|
| 17 |
from PyPDF2 import PdfFileReader
|
| 18 |
import warnings
|
| 19 |
+
from nltk import sent_tokenize
|
| 20 |
warnings.filterwarnings("ignore")
|
| 21 |
|
| 22 |
|
|
|
|
| 64 |
chunks[chunk_id] = " ".join(chunks[chunk_id])
|
| 65 |
|
| 66 |
return article_header, chunks
|
| 67 |
+
|
| 68 |
+
def chunk_clean_text(text, max_words=500):
    """Split *text* into word-count-bounded chunks of whole sentences.

    Sentences (from ``nltk.sent_tokenize``) are never split across chunks:
    a chunk grows sentence by sentence until adding the next sentence would
    push it past *max_words* whitespace-separated words, then a new chunk
    starts.  A single sentence longer than *max_words* therefore becomes
    its own oversized chunk.

    Parameters
    ----------
    text : str
        Plain text to split.  Empty input yields an empty list.
    max_words : int, optional
        Soft per-chunk word budget (default 500 — presumably sized to the
        downstream summarization model's input limit; confirm with caller).

    Returns
    -------
    list[str]
        Chunks, each re-joined into a single space-separated string.
    """
    sentences = sent_tokenize(text)
    current_chunk = 0
    chunks = []

    for sentence in sentences:
        words = sentence.split(" ")
        if len(chunks) == current_chunk + 1:
            # An open chunk exists: extend it while it still fits the
            # budget, otherwise start a fresh chunk with this sentence.
            if len(chunks[current_chunk]) + len(words) <= max_words:
                chunks[current_chunk].extend(words)
            else:
                current_chunk += 1
                chunks.append(words)
        else:
            # No chunk open yet (first sentence): start the first chunk.
            # (Removed leftover debug print(current_chunk) from here.)
            chunks.append(words)

    # Collapse each chunk's word list back into one string.
    for chunk_id in range(len(chunks)):
        chunks[chunk_id] = " ".join(chunks[chunk_id])

    return chunks
|
| 89 |
|
| 90 |
def preprocess_plain_text(x):
|
| 91 |
|
|
|
|
| 108 |
for i in range(count):
|
| 109 |
page = pdfReader.getPage(i)
|
| 110 |
all_text += page.extractText()
|
| 111 |
+
|
| 112 |
|
| 113 |
return all_text
|
| 114 |
|
|
|
|
| 223 |
|
| 224 |
elif upload_doc:
|
| 225 |
|
| 226 |
+
clean_text = chunk_clean_text(preprocess_plain_text(extract_text_from_file(upload_doc)))
|
| 227 |
|
| 228 |
else:
|
| 229 |
|
| 230 |
+
clean_text = chunk_clean_text(preprocess_plain_text(plain_text))
|
| 231 |
|
| 232 |
summarize = st.button("Summarize")
|
| 233 |
|