Spaces:
Sleeping
Sleeping
## Embeddings
"""
    string_to_float32_vector(str::String) -> Vector{Float32}

Parse the string representation of a `Vector{Float32}` (as produced by
`repr`/CSV round-tripping, e.g. `"Float32[1.0f0, 2.5f0]"`) back into a vector.

Returns an empty `Float32[]` for an empty literal such as `"Float32[]"`.
"""
function string_to_float32_vector(str::String)::Vector{Float32}
    # Extract the bracketed payload with a regex. The previous `strip` with a
    # character set was unsafe: it also ate leading/trailing digits '2'/'3' of
    # the first/last element (e.g. "Float32[3.25f0]" parsed as 0.25).
    m = match(r"\[(.*)\]", str)
    body = m === nothing ? str : m.captures[1]
    # An empty payload ("Float32[]") would otherwise make `parse` throw.
    isempty(strip(body)) && return Float32[]
    # Julia prints Float32 literals with an 'f' exponent ("1.0f0"), which
    # `parse(Float32, ...)` rejects; rewrite only digit-'f'-digit occurrences
    # so tokens like "Inf"/"NaN" are left alone.
    body = replace(body, r"(?<=\d)f(?=[-+]?\d)" => "e")
    return Float32[parse(Float32, strip(el)) for el in split(body, ',')]
end
"""
    dfdat_to_matrix(df::DataFrame, col::Symbol) -> Matrix{Float32}

Stack the serialized embedding strings in column `col` of `df` into a matrix
with one column per table row. `df` must be non-empty.
"""
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # `reduce(hcat, ...)` avoids splatting one argument per row into `hcat`,
    # which is slow and stack-heavy for large tables.
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end
"""
    create_chunked_text(text::String; chunk_size::Int=280) -> Vector{String}

Split `text` into consecutive chunks of at most `chunk_size` characters so each
piece can be embedded separately (chunk embeddings are averaged downstream).
Counts characters, not bytes, so multi-byte (Unicode) characters are never
split. The final chunk may be shorter; empty input yields an empty vector.

# Example
    text = repeat("This is a test. ", 100)
    chunks = create_chunked_text(text)
"""
function create_chunked_text(text::String; chunk_size::Int=280)
    chunk_size > 0 || throw(ArgumentError("chunk_size must be positive, got $chunk_size"))
    chunks = String[]  # concrete eltype instead of Vector{Any}
    i = firstindex(text)
    stop = lastindex(text)
    while i <= stop
        # Advance chunk_size - 1 characters from i (the previous version
        # advanced chunk_size times, yielding chunks of chunk_size + 1 chars),
        # clamped to the end of the string. `nextind` steps by whole
        # characters, keeping multi-byte sequences intact.
        j = min(nextind(text, i, chunk_size - 1), stop)
        push!(chunks, text[i:j])
        i = nextind(text, j)
    end
    return chunks
end
"""
    generate_embeddings(text::String) -> Vector{Float32}

Embed `text` with the small encoder (`Encoder.get_embeddings`). On any encoder
error, logs a warning and returns a 384-element zero vector (assumed to match
the small encoder's output dimension) so batch embedding jobs can continue.

# Example
    text = "This is a test."
    embd = generate_embeddings(text)
"""
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        # Log through the logging system (with backtrace) rather than
        # `println` to stdout; keep the zero-vector fallback so downstream
        # matrix shapes stay intact.
        @warn "Encoder.get_embeddings failed; returning zero vector" exception = (e, catch_backtrace())
        return zeros(Float32, 384)
    end
end
"""
    generate_embeddings_large(text::String) -> Vector{Float32}

Embed `text` with the large encoder (`Encoder.get_embeddings_big`). On any
encoder error, logs a warning and returns a 768-element zero vector (assumed
to match the large encoder's output dimension) so batch jobs can continue.

# Example
    text = "This is a test."
    embd = generate_embeddings_large(text)
    LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        # Log through the logging system (with backtrace) rather than
        # `println` to stdout; keep the zero-vector fallback so downstream
        # matrix shapes stay intact.
        @warn "Encoder.get_embeddings_big failed; returning zero vector" exception = (e, catch_backtrace())
        return zeros(Float32, 768)
    end
end
"""
    mini_embed(text::String)

Core embedding entry point for the small encoder: accepts a string of any
length, splits it into <=280-character chunks, embeds each chunk, and returns
the element-wise mean of the chunk embeddings.

# Example
    text = repeat("This is a test. ", 100)
    embd = mini_embed(text)

Batch usage over a table column: `mini_embed.(df.text)` (roughly 3K short
statements in tens of seconds; see the data-prep scripts that produced the
`*_embed.csv` files).
"""
function mini_embed(text::String)
    pieces = create_chunked_text(text)
    return mean(generate_embeddings.(pieces))
end
"""
    maxi_embed(text::String)

Like `mini_embed`, but embeds each <=280-character chunk with the large
encoder (`generate_embeddings_large`) and returns the element-wise mean.
"""
function maxi_embed(text::String)
    return mean(generate_embeddings_large.(create_chunked_text(text)))
end
"""
    distances_and_classification(narrative_matrix, target_matrix)

For every column of `target_matrix`, find the closest column of
`narrative_matrix` under cosine distance.

Returns `(distances, classification)` where `distances[i]` is the smallest
cosine distance for target column `i` and `classification[i]` is the
corresponding `CartesianIndex` into the distance matrix (its second coordinate
is the matched narrative column).

# Example
    distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
"""
function distances_and_classification(narrative_matrix, target_matrix)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # `findmin` yields the per-row minima and their indices in one pass; the
    # previous version ran `argmin` twice over the full distance matrix.
    vals, idxs = findmin(distances, dims=2)
    return vals[:, 1], idxs[:, 1]
end
"""
    dotproduct_distances(narrative_matrix, target_matrix) -> (indices, scores)

For each column of `target_matrix`, compute its dot product with every column
of `narrative_matrix` and return the index of the best-matching narrative
column together with that maximal dot product.

# Example
    ind, scores = dotproduct_distances(fc_embed, ts_embed)
    ts.fc_text = fc.text[ind]
"""
function dotproduct_distances(narrative_matrix, target_matrix)
    # (narratives x targets) score matrix: entry (i, j) = narrative_i . target_j
    dprods = narrative_matrix' * target_matrix
    # `findmax` returns maxima and argmax indices in one pass; the previous
    # version ran `argmax` and then re-indexed the matrix with the result.
    vals, idxs = findmax(dprods, dims=1)
    # Convert CartesianIndex to plain row (narrative) indices.
    return first.(Tuple.(idxs[1, :])), vals[1, :]
end
"""
    dotproduct_topk(narrative_matrix, target_vector, k) -> (indices, scores)

Return the indices and dot-product scores of the `k` columns of
`narrative_matrix` most similar to `target_vector`, in descending score order.
"""
function dotproduct_topk(narrative_matrix, target_vector, k)
    # Dot product of the target with every narrative column.
    dprods = narrative_matrix' * target_vector
    # Partial selection of the top k is O(n + k log k) instead of fully
    # sorting all n scores; `collect` materializes the returned view.
    topk = collect(partialsortperm(dprods, 1:k; rev=true))
    return topk, dprods[topk]
end
"""
    fast_topk(narrative_matrix, narratives, text::String, k) -> Vector{Dict}

Embed `text` with the large encoder (`maxi_embed`) and return the `k`
best-matching rows of `narratives` (whose embeddings are the columns of
`narrative_matrix`) as dictionaries with keys `"score"`, `"text"`,
`"claimUrl"`, `"policy"` and `"narrative"`.

When the `:Policy`/`:Narrative` or `:claimReviewUrl` columns are absent, the
placeholders "No policy"/"No narrative"/"No URL" are used instead.

# Example
    OC.fast_topk(fc_embed, fc, ts.statement[1], 5)
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds]
    else
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    # Read (or default) the URL column locally; the previous version assigned
    # a new column to `narratives`, mutating the caller's table as a side
    # effect of a read-only lookup.
    urls = hasproperty(narratives, :claimReviewUrl) ?
        narratives.claimReviewUrl :
        fill("No URL", size(narratives, 1))
    return [Dict("score" => scores[i],
                 "text" => narratives.text[ind],
                 "claimUrl" => urls[ind],
                 "policy" => policy[i],
                 "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
end
"""
    load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")

Read a CSV of previously embedded fact-check claims and return
`(embedding_matrix, table)`, where the matrix columns are the parsed
`:Embeddings` vectors.
"""
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    table = CSV.read(file, DataFrame)
    return dfdat_to_matrix(table, :Embeddings), table
end