Spaces:
Sleeping
Sleeping
| ## | |
| using CSV, JLD2, DataFrames, OpenAI, StatsBase, Distances, TidierPlots | |
| ## TODO: Base Functions | |
| # 1. Create a function to generate embeddings | |
| # 2. Create a function to get the distance to the closest claim, cut based on threshold | |
| # 3. Create a function to get the distance to the closest counterclaim, no cutting on threshold | |
| # 4. Create a function to compare the distance to the closest claim and counterclaim, assign label only if the distance to the closest claim is less than the distance to the closest counterclaim | |
| ## Analysis: | |
| # What is the distribution of distances by assigned narrative and label? | |
| ### UTILITIES #### | |
| # Define count function | |
"""
    table(df, cols)

Count the rows of `df` for every distinct combination of values in `cols`.

# Arguments
- `df`: the data frame to tabulate.
- `cols`: the grouping columns, given as symbols or as strings.

# Returns
A data frame with one row per group: the grouping columns plus an `nrow`
column holding the group size.
"""
function table(
    df::AbstractDataFrame, cols::AbstractVector{<:Union{Symbol,AbstractString}}
)
    return combine(groupby(df, cols), nrow)
end
| ######### | |
"""
    create_narrative_embeddings(regenerate=false)

Return the misinformation narrative library with an `"Embeddings"` column.

When `regenerate` is `false` and `data/narrative_embeddings.jld2` exists, the
cached table is loaded and returned unchanged. Otherwise the library is read
from `data/Modified Misinformation Library.csv`, the
`"Misinformation Narrative"` column is passed to `create_embeddings` together
with `ENV["OPENAI_API_KEY"]`, the embeddings are attached to the table, and the
table is written to the cache file before being returned.

# Examples
```julia
narrative_embeddings = create_narrative_embeddings()
```
"""
function create_narrative_embeddings(regenerate=false)
    cache_path = "data/narrative_embeddings.jld2"
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end

    narratives = CSV.read("data/Modified Misinformation Library.csv", DataFrame)

    ## narrative Embeddings
    n_embeddings = create_embeddings(
        ENV["OPENAI_API_KEY"], narratives[!, "Misinformation Narrative"]
    )

    # Materialise each embedding as a plain Vector{Float64} so the column has a
    # concrete element type and the cache does not depend on whatever array type
    # the response parser produced.
    narratives[!, "Embeddings"] = [
        collect(Float64, x["embedding"]) for x in n_embeddings.response["data"]
    ]

    # Save the embeddings
    save_object(cache_path, narratives)
    return narratives
end
"""
    create_test_embeddings(regenerate=false)

Return the testing data with an `"Embeddings"` column.

When `regenerate` is `false` and `data/test_embeddings.jld2` exists, the cached
table is loaded and returned unchanged. Otherwise the test set is read from
`data/Indicator_Test.csv`, the `"text"` column is passed to `create_embeddings`
together with `ENV["OPENAI_API_KEY"]`, the embeddings are attached to the
table, and the table is written to the cache file before being returned.

# Examples
```julia
target_embeddings = create_test_embeddings()
```
"""
function create_test_embeddings(regenerate=false)
    cache_path = "data/test_embeddings.jld2"
    if !regenerate && isfile(cache_path)
        return load_object(cache_path)
    end

    df_test = CSV.read("data/Indicator_Test.csv", DataFrame)

    ## text Embeddings
    n_embeddings = create_embeddings(ENV["OPENAI_API_KEY"], df_test[!, "text"])

    # Materialise each embedding as a plain Vector{Float64} so the column has a
    # concrete element type and the cache does not depend on whatever array type
    # the response parser produced.
    df_test[!, "Embeddings"] = [
        collect(Float64, x["embedding"]) for x in n_embeddings.response["data"]
    ]

    # Save the embeddings
    save_object(cache_path, df_test)
    return df_test
end
"""
    one_shot_classification!(narrative_embeddings, target_embeddings)

Annotate every row of `target_embeddings` with its nearest narrative.

The `"Embeddings"` columns of both tables are concatenated into matrices that
hold one embedding per column. Cosine distances between every target and every
narrative are computed, and for each target the `"Misinformation Narrative"`
value of the narrative with the smallest distance is stored in a
`"Closest Narrative"` column.

# Returns
`target_embeddings`, mutated in place.

# Examples
```julia
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
## Show the results - text, closest narrative
first(target_embeddings[:, ["text", "Closest Narrative", "label"]], 5)
```
"""
function one_shot_classification!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings, one embedding per column.
    # reduce(hcat, ...) avoids splatting a collection of unknown length into
    # the varargs of hcat.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])

    # distances[i, j] is the cosine distance between target i and narrative j.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)

    # Index of the closest narrative for each target (first one wins on ties).
    closest = [argmin(row) for row in eachrow(distances)]

    # `!` replaces the column outright, so re-running after a previous call does
    # not depend on the element type of an existing "Closest Narrative" column.
    target_embeddings[!, "Closest Narrative"] = [
        narrative_embeddings[j, "Misinformation Narrative"] for j in closest
    ]
    return target_embeddings
end
"""
    get_distances!(narrative_embeddings, target_embeddings)

Store, for every row of `target_embeddings`, the cosine distance to its
closest narrative in a `"Dist"` column.

The `"Embeddings"` columns of both tables are concatenated into matrices that
hold one embedding per column; the smallest distance from each target to any
narrative is kept.

# Returns
`target_embeddings`, mutated in place.
"""
function get_distances!(narrative_embeddings, target_embeddings)
    ## Matrix of embeddings, one embedding per column.
    # reduce(hcat, ...) avoids splatting a collection of unknown length into
    # the varargs of hcat.
    narrative_matrix = reduce(hcat, narrative_embeddings[:, "Embeddings"])
    target_matrix = reduce(hcat, target_embeddings[:, "Embeddings"])

    # distances[i, j] is the cosine distance between target i and narrative j.
    distances = pairwise(CosineDist(), target_matrix, narrative_matrix, dims=2)

    # Row-wise minimum = distance from each target to its closest narrative.
    target_embeddings[!, "Dist"] = vec(minimum(distances; dims=2))
    return target_embeddings
end
| ## Add vector of embeddings to the test dataset | |
| # 3. Generate embeddings of the narratives in multiple languages | |
| # 4. Create a langchain search function to check which narrative is closest to the input narrative | |
| # 5. Figure out how effective the embeddings are in recovering the narrative classification | |
| ## STEPS::::: Models | |
| # 1. Within each of the classified narratives, reuse the embeddings to find the misinfo by selecting the top K matches | |
| # 2. Train a model on the embeddings to predict the misinfo | |
| # Get the embeddings for the narratives | |
# Build (or load the cached) embeddings, then annotate every test row with its
# closest narrative and the cosine distance to that narrative.
narrative_embeddings = create_narrative_embeddings()
target_embeddings = create_test_embeddings()
one_shot_classification!(narrative_embeddings, target_embeddings)
get_distances!(narrative_embeddings, target_embeddings)

# Plot the distribution of distances by narrative and label
using TidierPlots

## By Label
ggplot(target_embeddings, @aes(x = label, y = Dist)) +
    geom_violin() + labs(x="Misinfo Label", y="Distance") #+ geom_hline()

## By Narrative
#ggplot(target_embeddings |> (data -> filter(:label => x -> x .== 1.0, data)), @aes(x = "Closest Narrative", y = Dist)) +
#    geom_violin()

### Assign MisinfoPred = true if the distance is below the threshold
distance_threshold = 0.2
target_embeddings[!, "MisinfoPred"] = target_embeddings[!, "Dist"] .< distance_threshold

## Precision and Recall
using MLJ
y_true = target_embeddings[!, "label"]
y_pred = target_embeddings[!, "MisinfoPred"]

# MLJ measures take the prediction first and the ground truth second. The rates
# below are not symmetric in their arguments, so the order matters.
confusion_matrix(y_pred, y_true)
accuracy(y_pred, y_true)
true_positive_rate(y_pred, y_true)
false_positive_rate(y_pred, y_true)

## Top 10 closest narratives among the rows labelled as misinformation
target_embeddings |>
    (data -> filter(:label => x -> x .== 1.0, data)) |>
    (data -> sort(data, :Dist)) |>
    (data -> first(data, 10)) |>
    (data -> select(data, ["text", "Closest Narrative", "Dist"]))