Update app.py
Browse files
app.py
CHANGED
|
@@ -16,11 +16,10 @@ import docx2txt
|
|
| 16 |
from io import StringIO
|
| 17 |
from PyPDF2 import PdfFileReader
|
| 18 |
import warnings
|
| 19 |
-
import nltk
|
| 20 |
|
| 21 |
-
nltk.download('punkt')
|
| 22 |
|
| 23 |
-
from nltk import sent_tokenize
|
| 24 |
warnings.filterwarnings("ignore")
|
| 25 |
|
| 26 |
|
|
@@ -71,7 +70,7 @@ def article_text_extractor(url: str):
|
|
| 71 |
|
| 72 |
def chunk_clean_text(text):
|
| 73 |
|
| 74 |
-
sentences = sent_tokenize(text)
|
| 75 |
current_chunk = 0
|
| 76 |
chunks = []
|
| 77 |
|
|
|
|
| 16 |
from io import StringIO
|
| 17 |
from PyPDF2 import PdfFileReader
|
| 18 |
import warnings
|
| 19 |
+
import nltk.data
|
| 20 |
|
| 21 |
+
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
|
| 22 |
|
|
|
|
| 23 |
warnings.filterwarnings("ignore")
|
| 24 |
|
| 25 |
|
|
|
|
| 70 |
|
| 71 |
def chunk_clean_text(text):
|
| 72 |
|
| 73 |
+
sentences = tokenizer(text)
|
| 74 |
current_chunk = 0
|
| 75 |
chunks = []
|
| 76 |
|