Commit

update

favstats committed Dec 27, 2024
1 parent a25c11a commit f5c0d0e

Showing 14 changed files with 581 additions and 63 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@
.Rdata
.httr-oauth
.DS_Store
data
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: metatargetr
Title: Retrieve and Parse Meta Ad Targeting Data
Version: 0.0.5
Authors@R:
c(person("Fabio", "Votta", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID")),
@@ -11,7 +11,7 @@ License: MIT + file LICENSE
Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Imports:
magrittr,
glue,
@@ -26,6 +26,7 @@ Imports:
tidyr
Suggests:
OpenImageR,
    testthat (>= 3.0.0),
    arrow
URL: https://github.com/favstats/metatargetr
BugReports: https://github.com/favstats/metatargetr/issues
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -6,13 +6,16 @@ export(find_name)
export(fix_json)
export(get_ad_snapshots)
export(get_page_insights)
export(get_report_db)
export(get_targeting)
export(get_targeting_db)
export(ggl_get_spending)
export(map_dfr_progress)
export(parse_location)
export(stupid_conversion)
export(unnest_and_fix_dups)
export(walk_progress)
importFrom(arrow,read_parquet)
importFrom(dplyr,bind_cols)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate_all)
187 changes: 187 additions & 0 deletions R/data.R
@@ -0,0 +1,187 @@
#' Retrieve Targeting Data from GitHub Repository
#'
#' This function retrieves targeting data for a specific country and timeframe
#' from the parquet files published as GitHub release assets in the
#' `favstats/meta_ad_targeting` repository. The `arrow` package is used to read
#' the parquet file directly from the constructed release URL.
#'
#' @param the_cntry Character. The ISO country code (e.g., "DE", "US").
#' @param tf Numeric or character. The timeframe in days (e.g., 30 or "30");
#'   it is spliced into the release tag as "<the_cntry>-last_<tf>_days".
#' @param ds Character. A timestamp or identifier used to construct the file path (e.g., "2024-12-25").
#' @param remove_nas Logical. If `TRUE` (the default), placeholder rows flagged
#'   in a `no_data` column are dropped from the result.
#' @param verbose Logical. If `TRUE`, progress messages are printed. Default is `FALSE`.
#' @return A data frame containing the targeting data from the parquet file.
#' @importFrom arrow read_parquet
#' @export
#'
#' @examples
#' \dontrun{
#' latest_data <- get_targeting_db(
#'   the_cntry = "DE",
#'   tf = 30,
#'   ds = "2024-10-25"
#' )
#' print(head(latest_data))
#' }
get_targeting_db <- function(the_cntry, tf, ds, remove_nas = TRUE, verbose = FALSE) {
  # Validate inputs
  if (missing(the_cntry) || missing(tf) || missing(ds)) {
    stop("All parameters (`the_cntry`, `tf`, `ds`) are required.")
  }

  # Construct the URL
  url <- paste0(
    "https://github.com/favstats/meta_ad_targeting/releases/download/",
    the_cntry,    # Country code
    "-last_", tf, # Timeframe in days
    "_days/",     # Fixed URL segment
    ds,           # Date or identifier
    ".parquet"    # File extension
  )
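  # For instance, the parameters from the example above (the_cntry = "DE",
  # tf = 30, ds = "2024-10-25") should resolve to:
  # https://github.com/favstats/meta_ad_targeting/releases/download/DE-last_30_days/2024-10-25.parquet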

  if (verbose) {
    message("Constructed URL: ", url)
  }

  # Attempt to read the parquet file
  tryCatch({
    data <- arrow::read_parquet(url)
    if (verbose) {
      message("Data successfully retrieved.")
    }
    if (remove_nas) {
      if ("no_data" %in% names(data)) {
        data <- data %>% dplyr::filter(is.na(no_data))
        if (verbose) {
          message("Missing data successfully removed.")
        }
      }
    }
    return(data)
  }, error = function(e) {
    stop("Failed to retrieve or parse the parquet file. Error: ", e$message)
  })
}

# # Define example parameters
# the_cntry <- "DE"
# tf <- 30
# ds <- "2024-10-25"
#
# # Call the function
# latest_data <- get_targeting_db(the_cntry, tf, ds)
#
# # Inspect the data
# print(head(latest_data))
# library(tidyverse)
# latest_data %>% filter(is.na(no_data))



#' Retrieve Report Data from GitHub Repository
#'
#' This function retrieves a report for a specific country and timeframe from
#' the RDS files published as GitHub release assets in the
#' `favstats/meta_ad_reports` repository. The file is downloaded to a temporary
#' location, read into R, and then deleted.
#'
#' @param the_cntry Character. The ISO country code (e.g., "DE", "US").
#' @param timeframe Character or Numeric. Timeframe in days (e.g., "30", "90") or "yesterday" / "lifelong".
#' @param ds Character. A timestamp or identifier used to construct the file name (e.g., "2024-12-25").
#' @param verbose Logical. Whether to print messages about the process. Default is `FALSE`.
#' @return A data frame or object read from the RDS file.
#' @export
#'
#' @examples
#' \dontrun{
#' report_data <- get_report_db(
#'   the_cntry = "DE",
#'   timeframe = 30,
#'   ds = "2024-12-25",
#'   verbose = TRUE
#' )
#' print(head(report_data))
#' }
get_report_db <- function(the_cntry, timeframe, ds, verbose = FALSE) {
# Validate inputs
if (missing(the_cntry) || missing(timeframe) || missing(ds)) {
stop("All parameters (`the_cntry`, `timeframe`, `ds`) are required.")
}

  # Construct the timeframe string (accept numerics and numeric strings,
  # as documented above)
  if (is.numeric(timeframe) || grepl("^[0-9]+$", as.character(timeframe))) {
    tf_string <- paste0("-last_", timeframe, "_days")
  } else if (timeframe %in% c("yesterday", "lifelong")) {
    tf_string <- paste0("-", timeframe)
  } else {
    stop("Invalid `timeframe` value. Must be numeric (e.g., 30, 90), a numeric string (e.g., \"30\"), or 'yesterday' / 'lifelong'.")
  }

# Construct the file name
file_name <- paste0(ds, ".rds")

# Construct the URL
url <- paste0(
"https://github.com/favstats/meta_ad_reports/releases/download/",
the_cntry, tf_string, "/",
file_name
)
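  # For instance, the_cntry = "DE", timeframe = 7, ds = "2024-10-25" should
  # resolve to:
  # https://github.com/favstats/meta_ad_reports/releases/download/DE-last_7_days/2024-10-25.rds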

# Temporary file path
temp_file <- tempfile(fileext = ".rds")

if (verbose) {
message("Constructed URL: ", url)
message("Downloading to temporary file: ", temp_file)
}

# Attempt to download and read the RDS file
tryCatch({
download.file(url, destfile = temp_file, mode = "wb")
if (verbose) {
message("File successfully downloaded.")
}

# Read the RDS file
data <- readRDS(temp_file)
if (verbose) {
message("Data successfully read from the RDS file.")
}

# Return the data
return(data)
}, error = function(e) {
stop("Failed to retrieve or parse the RDS file. Error: ", e$message)
}, finally = {
# Ensure the temporary file is deleted
if (file.exists(temp_file)) {
file.remove(temp_file)
if (verbose) {
message("Temporary file deleted.")
}
}
})
}


# report_data <- get_report_db(
# the_cntry = "DE",
# timeframe = 7,
# ds = "2024-10-25",
# verbose = TRUE
# )
103 changes: 89 additions & 14 deletions R/get_page_insights.R
@@ -42,30 +42,105 @@ ua_list <- c(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
)

include_page_info <- "page_info" %in% include_info

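# A user agent is drawn at random from the pool above on every call, so
# repeated requests do not present an identical client fingerprint.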
ua <- sample(ua_list, 1)
# print(ua)

# pageid <- "7860876103"
# timeframe <- "90"

fetch_page_info <- ifelse("page_info" %in% include_info, "true", "false")
# Define static parameters
static_params <- list(
av = "0", # Likely application version; may not change often. Optional.
"_aaid" = "0", # Anonymous Advertising ID; unique to the visitor. Required for tracking purposes.
user = "0", # User identifier; likely session-based or placeholder. Required.
a = "1", # Arbitrary request parameter; purpose unclear but likely required.
req = "3", # Request parameter; often a sequence or batch request identifier. Likely required.
hs = "19797.BP%3ADEFAULT.2.0..0.0", # Host session or configuration metadata; required for server-side routing.
# dpr = "1", # Device Pixel Ratio; reflects screen resolution. Optional but often included.
ccg = "EXCELLENT", # Connection grade; describes network quality. Optional but useful for server-side optimizations.
# rev = "1012093869", # Revision/version number; likely application or API version. Required.
# s = "sbbnic%3Awquopy%3A7r1j3c", # Session token or tracking identifier; unique to the visitor. Required.
# hsi = "7346737420686302672", # Hashed Session ID; unique to the visitor. Required.
# dyn = "7xe6Eiw_K9zo5ObwKBAgc9o2exu13wqojyUW3qi4EoxW4E7SewXwCwfW7oqx60Vo1upEK12wvk1bwbG78b87C2m3K2y11wBw5Zx62G3i1ywdl0Fw4Hwp8kwyx2cU8EmwoHwrUcUjwVw9O7bK2S2W2K4EG1Mxu16wciaw4JwJwSyES0gq0K-1LwqobU2cwmo6O1Fw44wt8",
# Dynamic parameters encoded in a proprietary format; likely unique to each request. Required.

csr = "", # CSRF token; placeholder here, likely required in some contexts.
# lsd = "AVo6-wl7l1Q", # Login session data; required for session validation.
# jazoest = "2881", # CSRF-related field; required for security checks.
# spin_r = "1012093869", # Spin-related metadata (server-specific session management). Required.
# spin_b = "trunk", # Backend branch/version. Required for routing to the correct API version.
# spin_t = "1710545602", # Server timestamp. Required for ensuring request freshness.
`_jssesw` = "1", # Encoded session value. Required for session management.
fb_api_caller_class = "RelayModern", # API metadata describing the client. Required.
fb_api_req_friendly_name = "AdLibraryMobileFocusedStateProviderQuery", # API-friendly name for request logging. Optional.
server_timestamps = "true", # Flag indicating server timestamps should be included. Likely required.
doc_id = "7193625857423421" # Unique document ID for the query. Required for identifying the request schema.
)

# Construct variables
variables <- jsonlite::toJSON(
list(
adType = "POLITICAL_AND_ISSUE_ADS", # Type of ads to query. Required.
audienceTimeframe = timeframe, # Timeframe for audience data (e.g., LAST_30_DAYS). Required.
country = iso2c, # Country ISO code (e.g., "DE"). Required.
viewAllPageID = pageid, # Page ID for which data is being queried. Required.
fetchPageInfo = include_page_info, # Boolean flag to fetch page-specific information. Optional.
fetchSharedDisclaimers = TRUE, # Boolean flag to include shared disclaimers. Optional but useful.
active_status = "ALL", # Filter for active/inactive ads. Required.
ad_type = "POLITICAL_AND_ISSUE_ADS", # Type of ads (repeated for clarity). Required.
bylines = list(), # List of bylines to filter ads. Optional.
# collation_token = "7ca3912f-0148-43ce-83e4-9a68ef656e4d",
# Unique token for grouping or collation; may be session-based. Likely required.

content_languages = list(), # Filter for content languages. Optional.
count = 30, # Number of results to fetch. Optional but usually required for pagination.
countries = list(iso2c), # List of countries for filtering (repeated for clarity). Required.
excluded_ids = list(), # IDs to exclude from results. Optional.
full_text_search_field = "ALL", # Full-text search field filter. Optional.
group_by_modes = list(), # Grouping modes for results. Optional.
search_type = "PAGE", # Type of search (e.g., by page). Required.
# session_id = "1678877b-700b-485a-abb0-60efcb6b4019",
# Unique session identifier for the query. Required for tracking.

sort_data = list(
mode = "SORT_BY_RELEVANCY_MONTHLY_GROUPED", # Sorting mode. Required.
direction = "ASCENDING" # Sorting direction. Required.
)
),
auto_unbox = TRUE
)

static_params$variables <- URLencode(variables)
body <- paste(names(static_params), static_params, sep = "=", collapse = "&")
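# For reference, `variables` is compact JSON along the lines of
#   {"adType":"POLITICAL_AND_ISSUE_ADS","audienceTimeframe":"...","country":"DE",...}
# and `body` is the usual key=value&key=value form string, with the JSON
# percent-encoded into the `variables` field.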

# print("Constructed body:")
# print(body)

resp <- request("https://www.facebook.com/api/graphql/") %>%
  req_headers(
    `Accept-Language` = paste0(
      lang, ",", stringr::str_split(lang, "-") %>% unlist() %>% .[1], ";q=0.5"
    ),
    `sec-fetch-site` = "same-origin",
    `user-agent` = ua
  ) %>%
  req_body_raw(body, "application/x-www-form-urlencoded") %>%
  req_perform()
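# The GraphQL endpoint answers with several JSON objects concatenated inside a
# single <p> element ({...}{...}); the lookbehind/lookahead split below
# separates them before each chunk is parsed with jsonlite::fromJSON().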

out <- resp %>%
  httr2::resp_body_html() %>%
  rvest::html_element("p") %>%
  rvest::html_text() %>%
  stringr::str_split_1('(?<=\\})\\s*(?=\\{)') %>%
  purrr::map(jsonlite::fromJSON)

if(!is.null(out[[1]][["errors"]][["description"]])){
message(out[[1]][["errors"]][["description"]])
@@ -79,14 +154,14 @@ if( "page_info" %in% include_info) {

  if ("page_info" %in% include_info & "targeting_info" %in% include_info) {
    if (join_info) {
      return(tibble::tibble(page_id = pageid, no_data = TRUE))
    } else {
      return(list(page_info = tibble::tibble(page_id = pageid, no_data = TRUE),
                  targeting_info = tibble::tibble(page_id = pageid, no_data = TRUE)))
    }
  } else {
    return(tibble::tibble(page_id = pageid, no_data = TRUE))
  }


@@ -105,7 +180,7 @@ if( "page_info" %in% include_info) {
dplyr::mutate_all(as.character) %>%
dplyr::mutate(shared_disclaimer_info = my_dataframe$page_id[1])
} else {
    page_info2 <- tibble::tibble(no_shared_disclaimer = TRUE)
}


Expand All @@ -127,7 +202,7 @@ if( "page_info" %in% include_info) {
if(!is.null(address_raw)){
address <- address_raw %>% purrr::flatten()
} else {
    address <- tibble::tibble(no_address = TRUE)
}


@@ -194,7 +269,7 @@ if( "page_info" %in% include_info & "targeting_info" %in% include_info ) {

if(join_info){
fin <- page_info %>%
    dplyr::left_join(targeting_info, by = "page_id")
} else {
fin <- list(page_info, targeting_info)
}