Spaces:
Build error
Build error
| import numpy, nltk | |
| nltk.download('punkt') | |
| from harvesttext import HarvestText | |
| from lex_rank_util import degree_centrality_scores, find_siblings_by_index | |
| from sentence_transformers import SentenceTransformer, util | |
class LexRankL12(object):
    """Pick the most central sentences of a text from embedding similarities.

    Sentences are embedded with the ``paraphrase-multilingual-MiniLM-L12-v2``
    SentenceTransformer model; their pairwise cosine similarities are scored
    by ``degree_centrality_scores`` and the ranking is handed to
    ``find_siblings_by_index`` to choose which sentences to return.
    """

    def __init__(self):
        # Sentence-embedding model used by find_central().
        self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        # HarvestText supplies the sentence splitter used for Chinese text.
        self.ht = HarvestText()

    def find_central(self, content: str, num=10, siblings=0):
        """Return the sentences of *content* selected as most central.

        Args:
            content: Text to summarise. Text containing Chinese characters is
                split with HarvestText, everything else with NLTK.
            num: Forwarded to ``find_siblings_by_index``; presumably the
                number of central sentences to keep (confirm against
                ``lex_rank_util``).
            siblings: Forwarded to ``find_siblings_by_index``; presumably how
                many neighbouring sentences to include around each central
                one (confirm against ``lex_rank_util``).

        Returns:
            A list of sentences taken from *content*, in the order of the
            indices produced by ``find_siblings_by_index``. An empty list is
            returned when *content* is empty, blank, or yields no sentences.
        """
        # Nothing to rank: avoid feeding an empty batch to the model.
        if not content or not content.strip():
            return []

        if self.contains_chinese(content):
            sentences = self.ht.cut_sentences(content)
        else:
            sentences = nltk.sent_tokenize(content)
        if not sentences:
            return []

        embeddings = self.model.encode(sentences, convert_to_tensor=True).cpu()

        # Compute the pair-wise cosine similarities
        cos_scores = util.cos_sim(embeddings, embeddings).numpy()

        # Compute the centrality for each sentence
        centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

        # We argsort so that the first element is the sentence with the highest score
        most_central_sentence_indices = numpy.argsort(-centrality_scores)

        central_and_siblings = find_siblings_by_index(
            sentences, most_central_sentence_indices, siblings, num
        )
        return [sentences[index] for index in central_and_siblings]

    def contains_chinese(self, content: str):
        """Return True if *content* has any character in U+4E00..U+9FA5."""
        return any('\u4e00' <= _char <= '\u9fa5' for _char in content)