Spaces:

stefanjwojcik
/

misinfo_detection_app

Sleeping

misinfo_detection_app / scripts /UpdateHuggingFaceAPI.jl

Add compression and decompression functions for fact check data; update dependencies and remove obsolete files

fd342b4 6 months ago

1.34 kB

	## Script to update the fact-check data for the HuggingFace API.
	##
	## Steps (every path below is relative to the current working directory):
	##   1. Load the fact-check records and embed `text` with `mini_embed`.
	##   2. Embed the same records again with `maxi_embed`, after dropping empty texts.
	##   3. Embed the expanded claims library with `maxi_embed`.
	##   4. Compress the maxi fact-check CSV and delete the uncompressed copy.
	##
	## NOTE(review): `CSV` and `DataFrame` are used below but not imported in this file;
	## presumably OstreaCultura re-exports them or they are already loaded - confirm.
	using OstreaCultura

	# Get the latest data to embed.
	# NOTE(review): this path works from a session started at the repository root. If this
	# file itself is run or included, `include` resolves relative to this file's directory
	# and "../scripts/google_fact_check_api.jl" would be needed instead - confirm usage.
	include("scripts/google_fact_check_api.jl")

	# Latest data. To refresh the raw data first, the original author left a (disabled)
	# call to `get_latest_fact_checks()` here.
	# NOTE(review): `errors` is never inspected below - confirm it is safe to ignore.
	fc, errors = load_fact_check_json()

	## Embed with MiniEncoder
	# A previously saved table can be used instead of the JSON load, e.g.
	# CSV.read("data/fact_check_latest.csv", DataFrame).
	# Drop rows with missing text before embedding.
	fc = fc[.!ismissing.(fc.text), :]
	fc_embed = mini_embed.(fc.text) # 12 minutes
	fc.Embeddings = fc_embed
	CSV.write("data/fact_check_latest_embed_mini.csv", fc)

	## Embed with MTR5-encoder
	# Repeats the missing-text filter (a no-op after the step above) ...
	fc = fc[.!ismissing.(fc.text), :]
	# ... and additionally drops rows where text == "".
	fc = fc[fc.text .!= "", :]
	fc_embed = OstreaCultura.maxi_embed.(fc.text)
	# Overwrites the mini embeddings stored in the `Embeddings` column.
	fc.Embeddings = fc_embed
	CSV.write("data/fact_check_latest_embed_maxi.csv", fc)


	## Embed the expanded claims library
	narrs = CSV.read("data/expansive_claims_library_expanded_embed.csv", DataFrame)
	# Use the expanded claim as the text to embed, then drop rows with missing text.
	narrs.text = narrs.ExpandedClaim
	narrs = narrs[.!ismissing.(narrs.text), :]
	# Module-qualified with `OstreaCultura` (as above): the `OC` alias used previously is
	# not defined anywhere in this file.
	narratives_embed = OstreaCultura.maxi_embed.(narrs.text) # seconds to run
	narrs.Embeddings = narratives_embed
	CSV.write("data/expansive_claims_library_expanded_embed_maxi.csv", narrs)
	# Compress the fact check data
	OstreaCultura.compress_csv("data/fact_check_latest_embed_maxi.csv", "data/fc_latest_maxi_compr")
	# Delete the original (uncompressed) CSV now that the compressed copy has been written.
	rm("data/fact_check_latest_embed_maxi.csv")