Update app.py
app.py CHANGED

@@ -60,11 +60,8 @@ def article_text_extractor(url: str):
 
     article_header = ''
 
-    article = " ".join(article_text)
-
-    article = article.replace("!", "!<eos>")
-    article = article.replace("?", "?<eos>")
-    sentences = article.split("<eos>")
+    article = nlp(" ".join(article_text))
+    sentences = [i.text for i in list(article.sents)]
 
     current_chunk = 0
     chunks = []
@@ -77,7 +74,6 @@ def article_text_extractor(url: str):
             current_chunk += 1
             chunks.append(sentence.split(" "))
         else:
-            print(current_chunk)
             chunks.append(sentence.split(" "))
 
     for chunk_id in range(len(chunks)):
@@ -86,8 +82,12 @@ def article_text_extractor(url: str):
     return article_header, chunks
 
 def chunk_clean_text(text):
-
-
+
+    """Chunk text longer than 500 tokens"""
+
+    article = nlp(" ".join(text))
+    sentences = [i.text for i in list(article.sents)]
+
     current_chunk = 0
     chunks = []
 
@@ -99,9 +99,8 @@ def chunk_clean_text(text):
             current_chunk += 1
             chunks.append(sentence.split(" "))
         else:
-            print(current_chunk)
             chunks.append(sentence.split(" "))
-
+
     for chunk_id in range(len(chunks)):
         chunks[chunk_id] = " ".join(chunks[chunk_id])
 
@@ -259,10 +258,10 @@ def highlight_entities(article_content,summary_output):
     print(summary_output)
 
     for entity in matched_entities:
-        summary_output =
+        summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_green + entity + markdown_end,summary_output)
 
     for entity in unmatched_entities:
-        summary_output = summary_output.
+        summary_output = summary_output = re.sub(f'({entity})(?![^rgb\(]*\))',markdown_start_red + entity + markdown_end,summary_output)
 
     print("")
     print(summary_output)