Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| """ | |
| utils.py - Utility functions for the project. | |
| """ | |
| import re | |
| import subprocess | |
| from collections import defaultdict | |
| from datetime import datetime | |
| from itertools import combinations | |
| from pathlib import Path | |
| from typing import List | |
| import nltk | |
| import torch | |
| from natsort import natsorted | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import sent_tokenize, word_tokenize | |
| from rapidfuzz import fuzz | |
| nltk.download("punkt", quiet=True) | |
| nltk.download( | |
| "popular", | |
| quiet=True, | |
| ) | |
| def validate_pytorch2(torch_version: str = None): | |
| torch_version = torch.__version__ if torch_version is None else torch_version | |
| pattern = r"^2\.\d+(\.\d+)*" | |
| return True if re.match(pattern, torch_version) else False | |
| def get_timestamp() -> str: | |
| """ | |
| get_timestamp - get a timestamp for the current time | |
| Returns: | |
| str, the timestamp | |
| """ | |
| return datetime.now().strftime("%Y%m%d_%H%M%S") | |
| def truncate_word_count(text, max_words=512): | |
| """ | |
| truncate_word_count - a helper function for the gradio module | |
| Parameters | |
| ---------- | |
| text : str, required, the text to be processed | |
| max_words : int, optional, the maximum number of words, default=512 | |
| Returns | |
| ------- | |
| dict, the text and whether it was truncated | |
| """ | |
| # split on whitespace with regex | |
| words = re.split(r"\s+", text) | |
| processed = {} | |
| if len(words) > max_words: | |
| processed["was_truncated"] = True | |
| processed["truncated_text"] = " ".join(words[:max_words]) | |
| else: | |
| processed["was_truncated"] = False | |
| processed["truncated_text"] = text | |
| return processed | |
| def load_examples(src, filetypes=[".txt", ".pdf"]): | |
| """ | |
| load_examples - a helper function for the gradio module to load examples | |
| Returns: | |
| list of str, the examples | |
| """ | |
| src = Path(src) | |
| src.mkdir(exist_ok=True) | |
| pdf_url = ( | |
| "https://www.dropbox.com/s/y92xy7o5qb88yij/all_you_need_is_attention.pdf?dl=1" | |
| ) | |
| subprocess.run(["wget", pdf_url, "-O", src / "all_you_need_is_attention.pdf"]) | |
| examples = [f for f in src.iterdir() if f.suffix in filetypes] | |
| examples = natsorted(examples) | |
| # load the examples into a list | |
| text_examples = [] | |
| for example in examples: | |
| with open(example, "r") as f: | |
| text = f.read() | |
| text_examples.append([text, "base", 2, 1024, 0.7, 3.5, 3]) | |
| return text_examples | |
| def load_example_filenames(example_path: str or Path): | |
| """ | |
| load_example_filenames - a helper function for the gradio module to load examples | |
| Returns: | |
| dict, the examples (filename:full path) | |
| """ | |
| example_path = Path(example_path) | |
| # load the examples into a list | |
| examples = {f.name: f for f in example_path.glob("*.txt")} | |
| return examples | |
| def extract_keywords(text: str, num_keywords: int = 3) -> List[str]: | |
| """ | |
| Extracts keywords from a text using the TextRank algorithm. | |
| Args: | |
| text: The text to extract keywords from. | |
| num_keywords: The number of keywords to extract. Default is 5. | |
| Returns: | |
| A list of strings, where each string is a keyword extracted from the input text. | |
| """ | |
| # Remove stopwords from the input text | |
| stop_words = set(stopwords.words("english")) | |
| text = " ".join([word for word in text.lower().split() if word not in stop_words]) | |
| # Tokenize the text into sentences and words | |
| sentences = sent_tokenize(text) | |
| words = [word_tokenize(sentence) for sentence in sentences] | |
| # Filter out words that are shorter than 3 characters | |
| words = [[word for word in sentence if len(word) >= 3] for sentence in words] | |
| # Create a graph of word co-occurrences | |
| cooccur = defaultdict(lambda: defaultdict(int)) | |
| for sentence in words: | |
| for w1, w2 in combinations(sentence, 2): | |
| cooccur[w1][w2] += 1 | |
| cooccur[w2][w1] += 1 | |
| # Assign scores to words using the TextRank algorithm | |
| scores = defaultdict(float) | |
| for i in range(10): | |
| for word in cooccur: | |
| score = 0.15 + 0.85 * sum( | |
| cooccur[word][other] / sum(cooccur[other].values()) * scores[other] | |
| for other in cooccur[word] | |
| ) | |
| scores[word] = score | |
| # Sort the words by score and return the top num_keywords keywords | |
| keywords = sorted(scores, key=scores.get, reverse=True)[:num_keywords] | |
| # Use fuzzy matching to remove similar keywords | |
| final_keywords = [] | |
| for keyword in keywords: | |
| if not any(fuzz.ratio(keyword, other) > 70 for other in final_keywords): | |
| final_keywords.append(keyword) | |
| return final_keywords | |
| def saves_summary( | |
| summarize_output, outpath: str or Path = None, add_signature=True, **kwargs | |
| ): | |
| """ | |
| saves_summary - save the summary generated from summarize_via_tokenbatches() to a text file | |
| summarize_output: output from summarize_via_tokenbatches() | |
| outpath: path to the output file | |
| add_signature: whether to add a signature to the output file | |
| kwargs: additional keyword arguments to include in the output file | |
| """ | |
| sum_text = [f"\t{s['summary'][0]}\n" for s in summarize_output] | |
| sum_scores = [f"\n - {round(s['summary_score'],4)}" for s in summarize_output] | |
| scores_text = "\n".join(sum_scores) | |
| full_summary = "\n".join(sum_text) | |
| keywords = "_".join(extract_keywords(full_summary)) | |
| outpath = ( | |
| Path.cwd() / f"document_summary_{get_timestamp()}_{keywords}.txt" | |
| if outpath is None | |
| else Path(outpath) | |
| ) | |
| with open( | |
| outpath, | |
| "w", | |
| encoding="utf-8", | |
| ) as fo: | |
| fo.writelines(full_summary) | |
| fo.write("\n\n") | |
| if add_signature: | |
| fo.write("\n\n---\n\n") | |
| fo.write("Generated with the Document Summarization space :)\n\n") | |
| fo.write("https://hf.co/spaces/pszemraj/document-summarization\n\n") | |
| with open( | |
| outpath, | |
| "a", | |
| ) as fo: | |
| fo.write("\n") | |
| fo.write(f"## Section Scores:\n\n") | |
| fo.writelines(scores_text) | |
| fo.write("\n\n") | |
| fo.write(f"Date: {get_timestamp()}\n\n") | |
| if kwargs: | |
| fo.write("---\n\n") | |
| fo.write("## Parameters:\n\n") | |
| for key, value in kwargs.items(): | |
| fo.write(f"{key}: {value}\n") | |
| return outpath | |