Spaces:
Sleeping
Sleeping
=
Add compression and decompression functions for fact check data; update dependencies and remove obsolete files
fd342b4
## Script to update the fact-check data for the HuggingFace API.
##
## Steps:
##   1. Load the fact-check records.
##   2. Embed the text with the mini encoder and write the result to CSV.
##   3. Embed the text with the maxi encoder and write the result to CSV.
##   4. Embed the expanded claims library with the maxi encoder and write it to CSV.
##   5. Compress the maxi fact-check CSV and delete the uncompressed file.
##
## NOTE(review): every path below is relative; presumably the script is run from
## the repository root -- confirm.
## NOTE(review): `CSV` and `DataFrame` are assumed to be brought into scope by
## `using OstreaCultura` or by the included script -- confirm.
using OstreaCultura

# Helpers for fetching/loading the fact-check data.
include("scripts/google_fact_check_api.jl")

# To refresh the raw data before embedding, call `get_latest_fact_checks()` here.
fc, errors = load_fact_check_json()

## Embed with the mini encoder
# Rows without text cannot be embedded, so drop them first.
fc = fc[.!ismissing.(fc.text), :]
fc_embed = mini_embed.(fc.text) # roughly 12 minutes
fc.Embeddings = fc_embed
CSV.write("data/fact_check_latest_embed_mini.csv", fc)

## Embed with the maxi encoder
# Missing text was already removed above; here we additionally drop empty strings.
fc = fc[fc.text .!= "", :]
fc_embed = OstreaCultura.maxi_embed.(fc.text)
# Overwrites the mini embeddings, which have already been written to disk.
fc.Embeddings = fc_embed
# One path shared by the write, compress and delete steps so they cannot drift apart.
fc_maxi_csv = "data/fact_check_latest_embed_maxi.csv"
CSV.write(fc_maxi_csv, fc)

## Embed the expanded claims library with the maxi encoder
narrs = CSV.read("data/expansive_claims_library_expanded_embed.csv", DataFrame)
# The expanded claim is the text that gets embedded.
narrs.text = narrs.ExpandedClaim
# Drop rows with missing text.
narrs = narrs[.!ismissing.(narrs.text), :]
# Use the fully qualified module name: this script never defines an `OC` alias.
narratives_embed = OstreaCultura.maxi_embed.(narrs.text) # seconds to run
narrs.Embeddings = narratives_embed
CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)

## Compress the fact-check data
OstreaCultura.compress_csv(fc_maxi_csv, "data/fc_latest_maxi_compr")
# Delete the uncompressed original. This line is only reached if `compress_csv`
# returned without throwing.
rm(fc_maxi_csv)