Spaces:
Sleeping
Sleeping
## SCRIPT TO UPDATE THE FACT CHECK DATA
#######################################
import os
import sys

import numpy as np
import pandas as pd
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone_text.sparse import BM25Encoder

# Project-local helpers (embedding + vector-dict construction) live here.
sys.path.append('src/python')
import DataLoader

# SECURITY FIX: the API key was hard-coded in source (a leaked credential).
# Read it from the environment instead; raises KeyError if unset so the
# failure is explicit rather than silently using a revoked key.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"], pool_threads=50)
##############################
# Load the latest fact-check export and build hybrid (sparse + dense) vectors.
df = pd.read_csv('data/fact_check_latest.csv')
# Keep only rows with unique, non-missing claim text.
df = df.drop_duplicates(subset=['text'])
df = df.dropna(subset=['text'])
# Missing review URLs become empty strings so every metadata record has the key.
df['claimReviewUrl'] = df['claimReviewUrl'].fillna('')
# Sparse (BM25) embeddings; newdf is presumably row-aligned with df — the
# metadata assignment below relies on that (TODO confirm in DataLoader).
bm25, newdf = DataLoader.create_sparse_embeds(pc, df)
# Per-vector metadata payload: claim text plus its fact-check URL.
metadata = df[['text', 'claimReviewUrl']].to_dict(orient='records')
newdf.loc[:, 'metadata'] = metadata
# Discard rows whose sparse embedding came back empty.
sparse_lengths = [len(x) for x in newdf['sparse_values']]
newdf = newdf[np.array(sparse_lengths) != 0].reset_index(drop=True)
# Combine sparse and dense parts into the upsert-ready vector dicts.
vecs = DataLoader.create_sparse_dense_dict(newdf)
index = pc.Index("oc-hybrid-library-index")

# Upsert in batches (presumably to stay within per-request limits — confirm
# against the Pinecone client docs).
BATCH_SIZE = 400
for start in range(0, len(vecs), BATCH_SIZE):
    end_index = min(start + BATCH_SIZE, len(vecs))
    index.upsert(vecs[start:end_index], namespace="expanded-fact-checks")
    # FIX: the original printed a placeholder-free f-string ("Upserted vectors")
    # on every iteration; report actual batch progress instead.
    print(f"Upserted vectors {start}-{end_index} of {len(vecs)}")
#####################################
### Querying performance for TruthSeeker Subset
# Load the evaluation subsample; each row carries one claim to score.
df = pd.read_csv('data/truthseeker_subsample.csv')
corpus = df['claim'].tolist()
| """ | |
| ## Function query, return score, title, link | |
| Example: get_score_title_link(corpus[0], pc, index) | |
| """ | |
| def get_score_title_link(querytext, pc, index): | |
| queryembed = DataLoader.query_embed(pc, "multilingual-e5-large", querytext) | |
| empty_sparse = DataLoader.empty_sparse_vector() | |
| res = index.query( | |
| top_k=1, | |
| namespace="expanded-fact-checks", | |
| vector=queryembed, | |
| sparse_vector=empty_sparse, | |
| include_metadata=True | |
| ) | |
| score = res['matches'][0]['score'] | |
| title = res['matches'][0]['metadata']['text'] | |
| link = res['matches'][0]['metadata']['claimReviewUrl'] | |
| return pd.Series([score, title, link], index=['score', 'title', 'link']) | |
## Get score, title, link for each querytext in corpus
import time
from pinecone.grpc import PineconeGRPC

# SECURITY FIX: the API key was hard-coded here as well; read it from the
# environment so no credential lives in source control.
pc = PineconeGRPC(api_key=os.environ["PINECONE_API_KEY"])
index = pc.Index(
    name="oc-hybrid-library-index",
    pool_threads=50,  # <-- make sure to set this
)

### TIMING
start_time = time.time()
# Score every claim; each call returns a Series whose fields expand into
# the three new columns.
df[['score', 'title', 'link']] = df['claim'].apply(get_score_title_link, args=(pc, index))
elapsed_time = time.time() - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")
######## END TIMING