### Google Fact Check Tools API helpers
using HTTP
using JSON3, DataFrames, ProgressMeter, Dates
using Downloads  # stdlib; used below in place of the deprecated Base.download

## Review publishers of interest and the query topics checked daily
const fact_check_sources = ["nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com"]
const query_categories = ["climate change", "jewish people", "black people", "immigration", "LGBTQ", "sexual and reproductive health"]
| """ | |
| ## Search Google Fact Check API | |
| ## API specs here: | |
| https://developers.google.com/fact-check/tools/api/reference/rest/v1alpha1/claims/search | |
| ## Example: | |
| response = search_claims(languageCode="en-US", reviewPublisherSiteFilter="politifact.com", maxAgeDays=7, pageSize=20, pageToken="", offset=0) | |
| responsenyt = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=700, pageSize=20) | |
| response = search_claims(query = "climate change", languageCode="en-US", maxAgeDays=1, pageSize=200) | |
| """ | |
function search_claims(;
    query::String = "",
    languageCode::String = "en-US", # BCP-47 language code
    reviewPublisherSiteFilter::String = "",
    maxAgeDays::Int = 7,
    pageSize::Int = 20,
    pageToken::String = "",
    offset::Int = 0)
    # Base URL of the claims:search endpoint
    url = "https://factchecktools.googleapis.com/v1alpha1/claims:search"
    # Build query parameters; the API key is read from the GOOGLECLOUD env var
    params = Dict("key" => ENV["GOOGLECLOUD"])
    if !isempty(query)
        params["query"] = query
    end
    if !isempty(languageCode)
        params["languageCode"] = languageCode
    end
    if !isempty(reviewPublisherSiteFilter)
        params["reviewPublisherSiteFilter"] = reviewPublisherSiteFilter
    end
    if maxAgeDays > 0
        params["maxAgeDays"] = string(maxAgeDays)
    end
    if pageSize != 10  # the API default is 10; only send when it differs
        params["pageSize"] = string(pageSize)
    end
    if !isempty(pageToken)
        params["pageToken"] = pageToken
    elseif offset > 0  # offset is only meaningful when no pageToken is set
        params["offset"] = string(offset)
    end
    # Make the HTTP GET request and parse the JSON response
    response = HTTP.get(url, query=params)
    return JSON3.read(response.body)
end
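## HTTP.get throws on non-2xx responses (quota errors included). A minimal
## retry sketch, assuming HTTP.StatusError is the failure worth retrying;
## the wrapper name and backoff policy are illustrative, not part of the API:
# function search_claims_with_retry(; retries::Int = 3, kwargs...)
#     for attempt in 1:retries
#         try
#             return search_claims(; kwargs...)
#         catch e
#             e isa HTTP.StatusError || rethrow()
#             sleep(2.0^attempt)  # exponential backoff between attempts
#         end
#     end
#     error("search_claims failed after $retries attempts")
# end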
| """ | |
| ## Convert the search response to a tabular format | |
| qu= "Video shows Kamala (Harris) responding to someone" | |
| response = search_claims(query = qu, languageCode="en-US", maxAgeDays=700, pageSize=20) | |
| searchresponse_to_tabular(response) | |
| """ | |
function searchresponse_to_tabular(response::JSON3.Object)::DataFrame
    # Build one row per claim; return an empty DataFrame when the response
    # has no `claims` field or an unexpected shape. Nested gets guard against
    # claims whose first review lacks a publisher entry.
    try
        results = DataFrame(
            text = String[get(x, :text, "") for x in response.claims],
            claimant = String[get(x, :claimant, "") for x in response.claims],
            claimDate = String[get(x, :claimDate, "") for x in response.claims],
            claimReviewPublisher = String[get(get(x[:claimReview][1], :publisher, Dict()), :site, "") for x in response.claims],
            claimReviewTitle = String[get(x[:claimReview][1], :title, "") for x in response.claims],
            claimReviewTextualRating = String[get(x[:claimReview][1], :textualRating, "") for x in response.claims],
            claimReviewUrl = String[get(x[:claimReview][1], :url, "") for x in response.claims])
        return results
    catch
        return DataFrame()
    end
end
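## Pages without a `claims` field come back as zero-column DataFrames, which
## DataFrames.jl skips in vcat, so flattening a mixed vector of pages should
## be safe (illustrative sketch; `pages` comes from paginate_claims below):
# dfs = searchresponse_to_tabular.(pages)
# flat = vcat(dfs...)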
| """ | |
| ## Paginate Google Fact Check API results | |
| use the pageToken to get the next page of results | |
| ## NOTES: | |
| - 'reviewPublisherSiteFilter' is a string that filters by the review publisher site. You can use things like: "nytimes.com", "washingtonpost.com", "politifact.com", "snopes.com", etc. | |
| - If you have reviewPublisherSiteFilter, then query can be empty. | |
| """ | |
function paginate_claims(;
    query::String = "",
    languageCode::String = "en-US", # BCP-47 language code
    reviewPublisherSiteFilter::String = "",
    maxAgeDays::Int = 7,
    pageSize::Int = 20,
    pageToken::String = "",
    offset::Int = 0,
    delay::Float64 = 1/(300/60)) # stay under 300 requests per minute
    # Collect one parsed JSON response per page
    results = []
    # Get the first page of results
    response = search_claims(query=query,
        languageCode=languageCode,
        reviewPublisherSiteFilter=reviewPublisherSiteFilter,
        maxAgeDays=maxAgeDays,
        pageSize=pageSize,
        pageToken=pageToken,
        offset=offset)
    push!(results, response)
    # Follow nextPageToken until the API stops returning one
    while haskey(response, :nextPageToken)
        sleep(delay)
        pageToken = response[:nextPageToken]
        response = search_claims(query=query,
            languageCode=languageCode,
            reviewPublisherSiteFilter=reviewPublisherSiteFilter,
            maxAgeDays=maxAgeDays,
            pageSize=pageSize,
            pageToken=pageToken,
            offset=offset)
        push!(results, response)
    end
    return results
end
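## Usage sketch (parameter values are illustrative): pull a year of PolitiFact
## reviews with no query term, then flatten the pages into one table.
# pages = paginate_claims(reviewPublisherSiteFilter = "politifact.com", maxAgeDays = 365, pageSize = 100)
# df = vcat(searchresponse_to_tabular.(pages)...)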
| """ | |
| # script to daily check for new fact-checks for each category | |
| allfacts = periodic_fact_check(365*8) | |
| ## Save the results to a CSV file | |
| using CSV, Dates | |
| CSV.write("data/google_fact_checks$(today()).csv", allfacts) | |
| allfacts= filter(:claimReviewTextualRating => x -> !contains(x, r" accurate| true"), allfacts) | |
| CSV.write("data/google_fact_checks2024-11-14.csv", allfacts) | |
| """ | |
function periodic_fact_check(max_days::Int = 1)
    allresults = DataFrame[]
    for category in query_categories
        println("Getting category: $category")
        paginated_results = paginate_claims(query = category, languageCode="en-US", maxAgeDays=max_days, pageSize=200)
        # Skip categories where no page contains any claims
        if any(haskey.(paginated_results, :claims))
            results = [searchresponse_to_tabular(page) for page in paginated_results]
            ## Concatenate the per-page tables and tag them with the category
            results = vcat(results...)
            results[!, :category] .= category
            push!(allresults, results)
        end
    end
    return vcat(allresults...)
end
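## Sketch of a simple daily driver loop (an assumption about deployment;
## a cron job invoking a script would work equally well; requires CSV):
# while true
#     new_facts = periodic_fact_check(1)
#     isempty(new_facts) || CSV.write("data/google_fact_checks$(today()).csv", new_facts)
#     sleep(24 * 60 * 60)  # wait a day between pulls
# end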
## Download the latest ClaimReview data feed from Google's Data Commons bucket
function get_latest_fact_checks()
    mkpath("data")  # ensure the target directory exists
    Downloads.download("https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json", "data/fact_check_latest.json")
end
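## Optional freshness guard (a sketch; the 24-hour threshold is an assumption):
# const FEED = "data/fact_check_latest.json"
# if !isfile(FEED) || time() - mtime(FEED) > 24 * 60 * 60
#     get_latest_fact_checks()
# end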
| """ | |
| d = Dict( | |
| :author => Dict( | |
| :name => "John Doe" | |
| ) | |
| ) | |
| safe_get(d, (:author, :name), "No name") | |
| """ | |
function safe_get(dict::Dict, keys::Tuple, default=nothing)
    current = dict
    for key in keys
        if haskey(current, key)
            current = current[key]
        else
            return default
        end
    end
    return current
end
## Pass DateTime and Missing values through unchanged
function safe_datetime(date::Union{DateTime, Missing})
    return date
end
## Convert a date string to a DateTime object without throwing an error
function safe_datetime(date::String)
    try
        return Dates.DateTime(date)  # default ISO format
    catch
        try
            Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SS.sssZ")
        catch
            try
                Dates.DateTime(date, dateformat"yyyy-mm-ddTHH:MM:SSZ")
            catch
                ## If all parsing attempts fail
                return missing
            end
        end
    end
end
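## Illustrative inputs for the fallback chain (values are made up):
# safe_datetime("2024-11-14T12:30:00")   # parsed by the default ISO format
# safe_datetime("2024-11-14T12:30:00Z")  # caught by the trailing-Z fallback
# safe_datetime("not a date")            # -> missing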
| """ | |
| ## Load the entire fact check JSON file | |
| - the updated dataset can be found at: https://storage.googleapis.com/datacommons-feeds/claimreview/latest/data.json | |
| df, errors = load_fact_check_json() | |
| """ | |
function load_fact_check_json(file::String="data/fact_check_latest.json"; get_latest=false)
    if get_latest
        get_latest_fact_checks()
    end
    # Read the file contents first: JSON3.read parses a JSON string, not a path
    feed = JSON3.read(read(file, String))
    dfout = DataFrame[]
    errors = 0
    error_index = Int64[]
    for (i, data) in enumerate(feed[:dataFeedElement])
        try
            d = Dict(data[:item][1])
            results = DataFrame(
                text = get(d, :claimReviewed, ""),
                claimant = safe_get(d, (:itemReviewed, :author, :name), ""),
                claimDate = safe_get(d, (:itemReviewed, :datePublished), ""),
                claimReviewDate = get(d, :datePublished, ""),
                claimReviewPublisher = get(d[:author], :name, ""),
                claimReviewTitle = "",
                claimReviewTextualRating = safe_get(d, (:reviewRating, :alternateName), ""),
                claimReviewUrl = get(data, :url, "")
            )
            push!(dfout, results)
        catch
            # Record the index of any malformed element and move on
            push!(error_index, i)
            errors += 1
        end
    end
    return (vcat(dfout...), error_index)
end
| """ | |
| ## Format the date columns in the DataFrame | |
| - drop rows where both date columns are missing | |
| df, errors = load_fact_check_json("data/fact_check_latest.json") | |
| format_date_cols!(df) | |
| """ | |
function format_date_cols!(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse each date column, mapping unparseable strings to missing
    for col in date_cols
        df[!, col] = safe_datetime.(df[!, col])
    end
    # Drop rows where both date columns are missing
    df[!, :contains_date] = .!(ismissing.(df.claimDate) .& ismissing.(df.claimReviewDate))
    subset!(df, :contains_date)
end
| """ | |
| ## Gets the latest date in the DataFrame from current date columns | |
| - used to identify the latest fact-checks in the datasets | |
| df, errs = load_fact_check_json("data/fact_check_latest.json") | |
| get_latest_date(df) | |
| """ | |
function get_latest_date(df::DataFrame, date_cols::Vector{Symbol}=[:claimDate, :claimReviewDate])
    # Parse the date columns first if they are still raw strings
    if df.claimDate[1] isa String
        format_date_cols!(df)
    end
    ## Dates in the future must be miscoded; set them to missing
    for col in date_cols
        df[!, col] = [coalesce(x, Dates.today()) > Dates.today() ? missing : x for x in df[!, col]]
    end
    maxdates = [maximum(coalesce.(df[!, col], Date(1901, 1, 1))) for col in date_cols]
    maxcolumn = date_cols[argmax(maxdates)]
    return maximum(maxdates), maxcolumn
end
| """ | |
| ## Identify the fact-checks in the latest dataset that are not in the previous dataset | |
| - use claimReviewDate to identify differences | |
| - get the latest claimReviewDate in current_data | |
| - get the latest claimReviewDate in previous_data | |
| - select the rows in current_data where claimReviewDate > latest_claimReviewDate | |
| Example: | |
| previous_data, errs = load_fact_check_json("data/fact_check_latest.json") | |
| current_data, errs = load_fact_check_json("data/fact_check_latest.json", get_latest=true) | |
| CSV.write("data/fact_check_latest.csv", current_data) | |
| new_fact_checks = get_new_fact_checks(current_data, previous_data) | |
| """ | |
function get_new_fact_checks(current_data::DataFrame, previous_data::DataFrame)
    latest_of_newest, datecol = get_latest_date(current_data)
    latest_of_previous, _ = get_latest_date(previous_data)  # keep current_data's date column for filtering
    # Keep rows in current_data dated after the newest date seen previously
    if latest_of_newest > latest_of_previous
        return current_data[coalesce.(current_data[!, datecol], Date(1901, 1, 1)) .> latest_of_previous, :]
    else
        return DataFrame()
    end
end