Commit

update

favstats committed Dec 27, 2024
1 parent a25c11a commit f5c0d0e

Showing 14 changed files with 581 additions and 63 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,3 +3,4 @@
.Rdata
.httr-oauth
.DS_Store
data
7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: metatargetr
Title: Retrieve and Parse Meta Ad Targeting Data
Version: 0.0.5
Authors@R:
c(person("Fabio", "Votta", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID")),
@@ -11,7 +11,7 @@ License: MIT + file LICENSE
Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Imports:
magrittr,
glue,
@@ -26,6 +26,7 @@ Imports:
tidyr
Suggests:
OpenImageR,
    testthat (>= 3.0.0),
    arrow
URL: https://github.com/favstats/metatargetr
BugReports: https://github.com/favstats/metatargetr/issues
3 changes: 3 additions & 0 deletions NAMESPACE
@@ -6,13 +6,16 @@ export(find_name)
export(fix_json)
export(get_ad_snapshots)
export(get_page_insights)
export(get_report_db)
export(get_targeting)
export(get_targeting_db)
export(ggl_get_spending)
export(map_dfr_progress)
export(parse_location)
export(stupid_conversion)
export(unnest_and_fix_dups)
export(walk_progress)
importFrom(arrow,read_parquet)
importFrom(dplyr,bind_cols)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate_all)
187 changes: 187 additions & 0 deletions R/data.R
@@ -0,0 +1,187 @@
#' Retrieve Targeting Data from GitHub Repository
#'
#' This function retrieves targeting data for a specific country and timeframe
#' from the parquet files published as GitHub release assets in the
#' `favstats/meta_ad_targeting` repository. The `arrow` package is used to read
#' the parquet file directly from the constructed release URL.
#'
#' @param the_cntry Character. The ISO country code (e.g., "DE", "US").
#' @param tf Numeric or character. The timeframe in days (e.g., 30 or "30");
#'   it is spliced into the release tag as "<the_cntry>-last_<tf>_days".
#' @param ds Character. A timestamp or identifier used to construct the file path (e.g., "2024-12-25").
#' @param remove_nas Logical. If `TRUE` (the default), placeholder rows flagged
#'   in a `no_data` column are dropped from the result.
#' @param verbose Logical. If `TRUE`, progress messages are printed. Default is `FALSE`.
#' @return A data frame containing the targeting data from the parquet file.
#' @importFrom arrow read_parquet
#' @export
#'
#' @examples
#' \dontrun{
#' latest_data <- get_targeting_db(
#'   the_cntry = "DE",
#'   tf = 30,
#'   ds = "2024-10-25"
#' )
#' print(head(latest_data))
#' }
get_targeting_db <- function(the_cntry, tf, ds, remove_nas = TRUE, verbose = FALSE) {
  # Validate inputs
  if (missing(the_cntry) || missing(tf) || missing(ds)) {
    stop("All parameters (`the_cntry`, `tf`, `ds`) are required.")
  }

  # Construct the URL
  url <- paste0(
    "https://github.com/favstats/meta_ad_targeting/releases/download/",
    the_cntry,    # Country code
    "-last_", tf, # Timeframe in days
    "_days/",     # Fixed URL segment
    ds,           # Date or identifier
    ".parquet"    # File extension
  )
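  # For instance, the parameters from the example above (the_cntry = "DE",
  # tf = 30, ds = "2024-10-25") should resolve to:
  # https://github.com/favstats/meta_ad_targeting/releases/download/DE-last_30_days/2024-10-25.parquet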

  if (verbose) {
    message("Constructed URL: ", url)
  }

  # Attempt to read the parquet file
  tryCatch({
    data <- arrow::read_parquet(url)
    if (verbose) {
      message("Data successfully retrieved.")
    }
    if (remove_nas) {
      if ("no_data" %in% names(data)) {
        data <- data %>% dplyr::filter(is.na(no_data))
        if (verbose) {
          message("Missing data successfully removed.")
        }
      }
    }
    return(data)
  }, error = function(e) {
    stop("Failed to retrieve or parse the parquet file. Error: ", e$message)
  })
}

# # Define example parameters
# the_cntry <- "DE"
# tf <- 30
# ds <- "2024-10-25"
#
# # Call the function
# latest_data <- get_targeting_db(the_cntry, tf, ds)
#
# # Inspect the data
# print(head(latest_data))
# library(tidyverse)
# latest_data %>% filter(is.na(no_data))



#' Retrieve Report Data from GitHub Repository
#'
#' This function retrieves a report for a specific country and timeframe from
#' the RDS files published as GitHub release assets in the
#' `favstats/meta_ad_reports` repository. The file is downloaded to a temporary
#' location, read into R, and then deleted.
#'
#' @param the_cntry Character. The ISO country code (e.g., "DE", "US").
#' @param timeframe Character or Numeric. Timeframe in days (e.g., "30", "90") or "yesterday" / "lifelong".
#' @param ds Character. A timestamp or identifier used to construct the file name (e.g., "2024-12-25").
#' @param verbose Logical. Whether to print messages about the process. Default is `FALSE`.
#' @return A data frame or object read from the RDS file.
#' @export
#'
#' @examples
#' \dontrun{
#' report_data <- get_report_db(
#'   the_cntry = "DE",
#'   timeframe = 30,
#'   ds = "2024-12-25",
#'   verbose = TRUE
#' )
#' print(head(report_data))
#' }
get_report_db <- function(the_cntry, timeframe, ds, verbose = FALSE) {
# Validate inputs
if (missing(the_cntry) || missing(timeframe) || missing(ds)) {
stop("All parameters (`the_cntry`, `timeframe`, `ds`) are required.")
}

  # Construct the timeframe string (accept numerics and numeric strings,
  # as documented above)
  if (is.numeric(timeframe) || grepl("^[0-9]+$", as.character(timeframe))) {
    tf_string <- paste0("-last_", timeframe, "_days")
  } else if (timeframe %in% c("yesterday", "lifelong")) {
    tf_string <- paste0("-", timeframe)
  } else {
    stop("Invalid `timeframe` value. Must be numeric (e.g., 30, 90), a numeric string (e.g., \"30\"), or 'yesterday' / 'lifelong'.")
  }

# Construct the file name
file_name <- paste0(ds, ".rds")

# Construct the URL
url <- paste0(
"https://github.com/favstats/meta_ad_reports/releases/download/",
the_cntry, tf_string, "/",
file_name
)
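  # For instance, the_cntry = "DE", timeframe = 7, ds = "2024-10-25" should
  # resolve to:
  # https://github.com/favstats/meta_ad_reports/releases/download/DE-last_7_days/2024-10-25.rds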

# Temporary file path
temp_file <- tempfile(fileext = ".rds")

if (verbose) {
message("Constructed URL: ", url)
message("Downloading to temporary file: ", temp_file)
}

# Attempt to download and read the RDS file
tryCatch({
download.file(url, destfile = temp_file, mode = "wb")
if (verbose) {
message("File successfully downloaded.")
}

# Read the RDS file
data <- readRDS(temp_file)
if (verbose) {
message("Data successfully read from the RDS file.")
}

# Return the data
return(data)
}, error = function(e) {
stop("Failed to retrieve or parse the RDS file. Error: ", e$message)
}, finally = {
# Ensure the temporary file is deleted
if (file.exists(temp_file)) {
file.remove(temp_file)
if (verbose) {
message("Temporary file deleted.")
}
}
})
}


# report_data <- get_report_db(
# the_cntry = "DE",
# timeframe = 7,
# ds = "2024-10-25",
# verbose = TRUE
# )
103 changes: 89 additions & 14 deletions R/get_page_insights.R
@@ -42,30 +42,105 @@ ua_list <- c(
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
)

include_page_info <- "page_info" %in% include_info

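# A user agent is drawn at random from the pool above on every call, so
# repeated requests do not present an identical client fingerprint.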
ua <- sample(ua_list, 1)
# print(ua)

# pageid <- "7860876103"
# timeframe <- "90"

fetch_page_info <- ifelse("page_info" %in% include_info, "true", "false")
# Define static parameters
static_params <- list(
av = "0", # Likely application version; may not change often. Optional.
"_aaid" = "0", # Anonymous Advertising ID; unique to the visitor. Required for tracking purposes.
user = "0", # User identifier; likely session-based or placeholder. Required.
a = "1", # Arbitrary request parameter; purpose unclear but likely required.
req = "3", # Request parameter; often a sequence or batch request identifier. Likely required.
hs = "19797.BP%3ADEFAULT.2.0..0.0", # Host session or configuration metadata; required for server-side routing.
# dpr = "1", # Device Pixel Ratio; reflects screen resolution. Optional but often included.
ccg = "EXCELLENT", # Connection grade; describes network quality. Optional but useful for server-side optimizations.
# rev = "1012093869", # Revision/version number; likely application or API version. Required.
# s = "sbbnic%3Awquopy%3A7r1j3c", # Session token or tracking identifier; unique to the visitor. Required.
# hsi = "7346737420686302672", # Hashed Session ID; unique to the visitor. Required.
# dyn = "7xe6Eiw_K9zo5ObwKBAgc9o2exu13wqojyUW3qi4EoxW4E7SewXwCwfW7oqx60Vo1upEK12wvk1bwbG78b87C2m3K2y11wBw5Zx62G3i1ywdl0Fw4Hwp8kwyx2cU8EmwoHwrUcUjwVw9O7bK2S2W2K4EG1Mxu16wciaw4JwJwSyES0gq0K-1LwqobU2cwmo6O1Fw44wt8",
# Dynamic parameters encoded in a proprietary format; likely unique to each request. Required.

csr = "", # CSRF token; placeholder here, likely required in some contexts.
# lsd = "AVo6-wl7l1Q", # Login session data; required for session validation.
# jazoest = "2881", # CSRF-related field; required for security checks.
# spin_r = "1012093869", # Spin-related metadata (server-specific session management). Required.
# spin_b = "trunk", # Backend branch/version. Required for routing to the correct API version.
# spin_t = "1710545602", # Server timestamp. Required for ensuring request freshness.
`_jssesw` = "1", # Encoded session value. Required for session management.
fb_api_caller_class = "RelayModern", # API metadata describing the client. Required.
fb_api_req_friendly_name = "AdLibraryMobileFocusedStateProviderQuery", # API-friendly name for request logging. Optional.
server_timestamps = "true", # Flag indicating server timestamps should be included. Likely required.
doc_id = "7193625857423421" # Unique document ID for the query. Required for identifying the request schema.
)

# Construct variables
variables <- jsonlite::toJSON(
list(
adType = "POLITICAL_AND_ISSUE_ADS", # Type of ads to query. Required.
audienceTimeframe = timeframe, # Timeframe for audience data (e.g., LAST_30_DAYS). Required.
country = iso2c, # Country ISO code (e.g., "DE"). Required.
viewAllPageID = pageid, # Page ID for which data is being queried. Required.
fetchPageInfo = include_page_info, # Boolean flag to fetch page-specific information. Optional.
fetchSharedDisclaimers = TRUE, # Boolean flag to include shared disclaimers. Optional but useful.
active_status = "ALL", # Filter for active/inactive ads. Required.
ad_type = "POLITICAL_AND_ISSUE_ADS", # Type of ads (repeated for clarity). Required.
bylines = list(), # List of bylines to filter ads. Optional.
# collation_token = "7ca3912f-0148-43ce-83e4-9a68ef656e4d",
# Unique token for grouping or collation; may be session-based. Likely required.

content_languages = list(), # Filter for content languages. Optional.
count = 30, # Number of results to fetch. Optional but usually required for pagination.
countries = list(iso2c), # List of countries for filtering (repeated for clarity). Required.
excluded_ids = list(), # IDs to exclude from results. Optional.
full_text_search_field = "ALL", # Full-text search field filter. Optional.
group_by_modes = list(), # Grouping modes for results. Optional.
search_type = "PAGE", # Type of search (e.g., by page). Required.
# session_id = "1678877b-700b-485a-abb0-60efcb6b4019",
# Unique session identifier for the query. Required for tracking.

sort_data = list(
mode = "SORT_BY_RELEVANCY_MONTHLY_GROUPED", # Sorting mode. Required.
direction = "ASCENDING" # Sorting direction. Required.
)
),
auto_unbox = TRUE
)

static_params$variables <- URLencode(variables)
body <- paste(names(static_params), static_params, sep = "=", collapse = "&")
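# For reference, `variables` is compact JSON along the lines of
#   {"adType":"POLITICAL_AND_ISSUE_ADS","audienceTimeframe":"...","country":"DE",...}
# and `body` is the usual key=value&key=value form string, with the JSON
# percent-encoded into the `variables` field.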

# print("Constructed body:")
# print(body)

resp <- request("https://www.facebook.com/api/graphql/") %>%
  req_headers(
    `Accept-Language` = paste0(
      lang, ",", stringr::str_split(lang, "-") %>% unlist() %>% .[1], ";q=0.5"
    ),
    `sec-fetch-site` = "same-origin",
    `user-agent` = ua
  ) %>%
  req_body_raw(body, "application/x-www-form-urlencoded") %>%
  req_perform()
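# The GraphQL endpoint answers with several JSON objects concatenated inside a
# single <p> element ({...}{...}); the lookbehind/lookahead split below
# separates them before each chunk is parsed with jsonlite::fromJSON().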

out <- resp %>%
  httr2::resp_body_html() %>%
  rvest::html_element("p") %>%
  rvest::html_text() %>%
  stringr::str_split_1('(?<=\\})\\s*(?=\\{)') %>%
  purrr::map(jsonlite::fromJSON)

if(!is.null(out[[1]][["errors"]][["description"]])){
message(out[[1]][["errors"]][["description"]])
@@ -79,14 +154,14 @@ if( "page_info" %in% include_info) {

  if ("page_info" %in% include_info & "targeting_info" %in% include_info) {
    if (join_info) {
      return(tibble::tibble(page_id = pageid, no_data = TRUE))
    } else {
      return(list(page_info = tibble::tibble(page_id = pageid, no_data = TRUE),
                  targeting_info = tibble::tibble(page_id = pageid, no_data = TRUE)))
    }
  } else {
    return(tibble::tibble(page_id = pageid, no_data = TRUE))
  }


@@ -105,7 +180,7 @@ if( "page_info" %in% include_info) {
dplyr::mutate_all(as.character) %>%
dplyr::mutate(shared_disclaimer_info = my_dataframe$page_id[1])
} else {
    page_info2 <- tibble::tibble(no_shared_disclaimer = TRUE)
}


Expand All @@ -127,7 +202,7 @@ if( "page_info" %in% include_info) {
if(!is.null(address_raw)){
address <- address_raw %>% purrr::flatten()
} else {
    address <- tibble::tibble(no_address = TRUE)
}


@@ -194,7 +269,7 @@ if( "page_info" %in% include_info & "targeting_info" %in% include_info ) {

if(join_info){
fin <- page_info %>%
    dplyr::left_join(targeting_info, by = "page_id")
} else {
fin <- list(page_info, targeting_info)
}