Spaces:
Sleeping
Sleeping
## Embeddings
"""
    string_to_float32_vector(str::String) -> Vector{Float32}

Parse the string representation of a `Vector{Float32}` (as produced by
`repr`/CSV round-tripping, e.g. `"Float32[1.0f0, 2.5f0]"`) back into a vector.

Returns an empty `Float32[]` for an empty literal such as `"Float32[]"`.
"""
function string_to_float32_vector(str::String)::Vector{Float32}
    # Extract the bracketed payload with a regex. The previous `strip` with a
    # character set was unsafe: it also ate leading/trailing digits '2'/'3' of
    # the first/last element (e.g. "Float32[3.25f0]" parsed as 0.25).
    m = match(r"\[(.*)\]", str)
    body = m === nothing ? str : m.captures[1]
    # An empty payload ("Float32[]") would otherwise make `parse` throw.
    isempty(strip(body)) && return Float32[]
    # Julia prints Float32 literals with an 'f' exponent ("1.0f0"), which
    # `parse(Float32, ...)` rejects; rewrite only digit-'f'-digit occurrences
    # so tokens like "Inf"/"NaN" are left alone.
    body = replace(body, r"(?<=\d)f(?=[-+]?\d)" => "e")
    return Float32[parse(Float32, strip(el)) for el in split(body, ',')]
end
"""
    dfdat_to_matrix(df::DataFrame, col::Symbol) -> Matrix{Float32}

Stack the serialized embedding strings in column `col` of `df` into a matrix
with one column per table row. `df` must be non-empty.
"""
function dfdat_to_matrix(df::DataFrame, col::Symbol)::Matrix{Float32}
    # `reduce(hcat, ...)` avoids splatting one argument per row into `hcat`,
    # which is slow and stack-heavy for large tables.
    return reduce(hcat, [string_to_float32_vector(row[col]) for row in eachrow(df)])
end
"""
    create_chunked_text(text::String; chunk_size::Int=280) -> Vector{String}

Split `text` into consecutive chunks of at most `chunk_size` characters so each
piece can be embedded separately (chunk embeddings are averaged downstream).
Counts characters, not bytes, so multi-byte (Unicode) characters are never
split. The final chunk may be shorter; empty input yields an empty vector.

# Example
    text = repeat("This is a test. ", 100)
    chunks = create_chunked_text(text)
"""
function create_chunked_text(text::String; chunk_size::Int=280)
    chunk_size > 0 || throw(ArgumentError("chunk_size must be positive, got $chunk_size"))
    chunks = String[]  # concrete eltype instead of Vector{Any}
    i = firstindex(text)
    stop = lastindex(text)
    while i <= stop
        # Advance chunk_size - 1 characters from i (the previous version
        # advanced chunk_size times, yielding chunks of chunk_size + 1 chars),
        # clamped to the end of the string. `nextind` steps by whole
        # characters, keeping multi-byte sequences intact.
        j = min(nextind(text, i, chunk_size - 1), stop)
        push!(chunks, text[i:j])
        i = nextind(text, j)
    end
    return chunks
end
"""
    generate_embeddings(text::String) -> Vector{Float32}

Embed `text` with the small encoder (`Encoder.get_embeddings`). On any encoder
error, logs a warning and returns a 384-element zero vector (assumed to match
the small encoder's output dimension) so batch embedding jobs can continue.

# Example
    text = "This is a test."
    embd = generate_embeddings(text)
"""
function generate_embeddings(text::String)
    try
        return Encoder.get_embeddings(text)
    catch e
        # Log through the logging system (with backtrace) rather than
        # `println` to stdout; keep the zero-vector fallback so downstream
        # matrix shapes stay intact.
        @warn "Encoder.get_embeddings failed; returning zero vector" exception = (e, catch_backtrace())
        return zeros(Float32, 384)
    end
end
"""
    generate_embeddings_large(text::String) -> Vector{Float32}

Embed `text` with the large encoder (`Encoder.get_embeddings_big`). On any
encoder error, logs a warning and returns a 768-element zero vector (assumed
to match the large encoder's output dimension) so batch jobs can continue.

# Example
    text = "This is a test."
    embd = generate_embeddings_large(text)
    LinearAlgebra.normalize(embd)
"""
function generate_embeddings_large(text::String)
    try
        return Encoder.get_embeddings_big(text)
    catch e
        # Log through the logging system (with backtrace) rather than
        # `println` to stdout; keep the zero-vector fallback so downstream
        # matrix shapes stay intact.
        @warn "Encoder.get_embeddings_big failed; returning zero vector" exception = (e, catch_backtrace())
        return zeros(Float32, 768)
    end
end
"""
    mini_embed(text::String)

Core embedding entry point for the small encoder: accepts a string of any
length, splits it into <=280-character chunks, embeds each chunk, and returns
the element-wise mean of the chunk embeddings.

# Example
    text = repeat("This is a test. ", 100)
    embd = mini_embed(text)

Batch usage over a table column: `mini_embed.(df.text)` (roughly 3K short
statements in tens of seconds; see the data-prep scripts that produced the
`*_embed.csv` files).
"""
function mini_embed(text::String)
    pieces = create_chunked_text(text)
    return mean(generate_embeddings.(pieces))
end
"""
    maxi_embed(text::String)

Like `mini_embed`, but embeds each <=280-character chunk with the large
encoder (`generate_embeddings_large`) and returns the element-wise mean.
"""
function maxi_embed(text::String)
    return mean(generate_embeddings_large.(create_chunked_text(text)))
end
"""
    distances_and_classification(narrative_matrix, target_matrix)

For every column of `target_matrix`, find the closest column of
`narrative_matrix` under cosine distance.

Returns `(distances, classification)` where `distances[i]` is the smallest
cosine distance for target column `i` and `classification[i]` is the
corresponding `CartesianIndex` into the distance matrix (its second coordinate
is the matched narrative column).

# Example
    distances, classification = distances_and_classification(fc_embed, ts_embed[:, 1:5])
"""
function distances_and_classification(narrative_matrix, target_matrix)
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)
    # `findmin` yields the per-row minima and their indices in one pass; the
    # previous version ran `argmin` twice over the full distance matrix.
    vals, idxs = findmin(distances, dims=2)
    return vals[:, 1], idxs[:, 1]
end
"""
    dotproduct_distances(narrative_matrix, target_matrix) -> (indices, scores)

For each column of `target_matrix`, compute its dot product with every column
of `narrative_matrix` and return the index of the best-matching narrative
column together with that maximal dot product.

# Example
    ind, scores = dotproduct_distances(fc_embed, ts_embed)
    ts.fc_text = fc.text[ind]
"""
function dotproduct_distances(narrative_matrix, target_matrix)
    # (narratives x targets) score matrix: entry (i, j) = narrative_i . target_j
    dprods = narrative_matrix' * target_matrix
    # `findmax` returns maxima and argmax indices in one pass; the previous
    # version ran `argmax` and then re-indexed the matrix with the result.
    vals, idxs = findmax(dprods, dims=1)
    # Convert CartesianIndex to plain row (narrative) indices.
    return first.(Tuple.(idxs[1, :])), vals[1, :]
end
"""
    dotproduct_topk(narrative_matrix, target_vector, k) -> (indices, scores)

Return the indices and dot-product scores of the `k` columns of
`narrative_matrix` most similar to `target_vector`, in descending score order.
"""
function dotproduct_topk(narrative_matrix, target_vector, k)
    # Dot product of the target with every narrative column.
    dprods = narrative_matrix' * target_vector
    # Partial selection of the top k is O(n + k log k) instead of fully
    # sorting all n scores; `collect` materializes the returned view.
    topk = collect(partialsortperm(dprods, 1:k; rev=true))
    return topk, dprods[topk]
end
"""
    fast_topk(narrative_matrix, narratives, text::String, k) -> Vector{Dict}

Embed `text` with the large encoder (`maxi_embed`) and return the `k`
best-matching rows of `narratives` (whose embeddings are the columns of
`narrative_matrix`) as dictionaries with keys `"score"`, `"text"`,
`"claimUrl"`, `"policy"` and `"narrative"`.

When the `:Policy`/`:Narrative` or `:claimReviewUrl` columns are absent, the
placeholders "No policy"/"No narrative"/"No URL" are used instead.

# Example
    OC.fast_topk(fc_embed, fc, ts.statement[1], 5)
"""
function fast_topk(narrative_matrix, narratives, text::String, k)
    target_vector = maxi_embed(text)
    inds, scores = dotproduct_topk(narrative_matrix, target_vector, k)
    if hasproperty(narratives, :Policy)
        policy = narratives.Policy[inds]
        narrative = narratives.Narrative[inds]
    else
        policy = fill("No policy", k)
        narrative = fill("No narrative", k)
    end
    # Read (or default) the URL column locally; the previous version assigned
    # a new column to `narratives`, mutating the caller's table as a side
    # effect of a read-only lookup.
    urls = hasproperty(narratives, :claimReviewUrl) ?
        narratives.claimReviewUrl :
        fill("No URL", size(narratives, 1))
    return [Dict("score" => scores[i],
                 "text" => narratives.text[ind],
                 "claimUrl" => urls[ind],
                 "policy" => policy[i],
                 "narrative" => narrative[i]) for (i, ind) in enumerate(inds)]
end
"""
    load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")

Read a CSV of previously embedded fact-check claims and return
`(embedding_matrix, table)`, where the matrix columns are the parsed
`:Embeddings` vectors.
"""
function load_fasttext_embeddings(file::String="data/fact_check_latest_embed.csv")
    table = CSV.read(file, DataFrame)
    return dfdat_to_matrix(table, :Embeddings), table
end