From c139ba4d1f466236239030b303ff1fac4100fbd6 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Tue, 10 Dec 2024 16:56:23 -0500 Subject: [PATCH 01/12] Bug 1922986 - Create script that exports BMO data as JSON suitable for import into a BigQuery instance in GCP --- .circleci/config.yml | 2 +- .github/workflows/ci.yml | 2 +- conf/checksetup_answers.txt | 6 + docker-compose.test.yml | 11 + docker-compose.yml | 28 + docker/bigquery/Dockerfile | 3 + docker/bigquery/data.yaml | 185 +++++ extensions/BMO/Extension.pm | 42 + extensions/BMO/bin/export_bmo_etl.pl | 727 ++++++++++++++++++ extensions/BMO/t/bmo/bmo_etl.t | 136 ++++ .../params/editparams-current_panel.html.tmpl | 7 + 11 files changed, 1147 insertions(+), 2 deletions(-) create mode 100644 docker/bigquery/Dockerfile create mode 100644 docker/bigquery/data.yaml create mode 100644 extensions/BMO/bin/export_bmo_etl.pl create mode 100644 extensions/BMO/t/bmo/bmo_etl.t diff --git a/.circleci/config.yml b/.circleci/config.yml index 384733218a..32cf21b39a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -271,7 +271,7 @@ jobs: name: run bmo specific tests command: | [[ -f build_info/only_version_changed.txt ]] && exit 0 - docker-compose -f docker-compose.test.yml run --build bmo.test test_bmo -q -f t/bmo/*.t + docker-compose -f docker-compose.test.yml run --build bmo.test test_bmo -q -f t/bmo/*.t extensions/*/t/bmo/*.t - *store_log workflows: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1c67b0f7ba..9cdfa8b220 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: - name: Build Docker test images run: docker-compose -f docker-compose.test.yml build - name: Run bmo specific tests - run: docker-compose -f docker-compose.test.yml run -e CI=1 bmo.test test_bmo -q -f t/bmo/*.t + run: docker-compose -f docker-compose.test.yml run -e CI=1 bmo.test test_bmo -q -f t/bmo/*.t extensions/*/t/bmo/*.t test_selenium_1: runs-on: ubuntu-latest diff --git a/conf/checksetup_answers.txt b/conf/checksetup_answers.txt index 4b8e830f76..251d0f3fb2 100644 --- a/conf/checksetup_answers.txt +++ b/conf/checksetup_answers.txt @@ -60,6 +60,12 @@ $answer{'sitemapindex_google_host'} = 'gcs'; $answer{'sitemapindex_google_bucket'} = 'sitemapindex'; $answer{'sitemapindex_google_service_account'} = 'test'; +$answer{'bmo_etl_enabled'} = 1; +$answer{'bmo_etl_base_url'} = 'http://bq:9050'; +$answer{'bmo_etl_service_account'} = 'test'; +$answer{'bmo_etl_project_id'} = 'test'; +$answer{'bmo_etl_dataset_id'} = 'bugzilla'; + $answer{'duo_uri'} = 'http://localhost:8001'; $answer{'duo_client_id'} = '6rZ3KnrL04uyGjLd8foO'; $answer{'duo_client_secret'} = '3vg6cm0Gj0DpC6ZJACXdZ1NrVRi1AhkwjfXnlFaJ'; diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 152f86393e..563fc601c1 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -37,6 +37,7 @@ services: - memcached - s3 - gcs + - bq externalapi.test: build: *build_bmo @@ -69,3 +70,13 @@ services: - ./docker/gcs/attachments:/data/attachments - ./docker/gcs/sitemapindex:/data/sitemapindex - ./docker/gcs/mining:/data/mining + + bq: + build: + context: ./docker/bigquery + dockerfile: Dockerfile + ports: + - 9050:9050 + working_dir: /work + command: | + --project=test --data-from-yaml=/data.yaml diff --git a/docker-compose.yml b/docker-compose.yml index c238bb07a5..fb9da1f7e0 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,7 +42,9 @@ services: - memcached - s3 - gcs + - bigquery - externalapi.test + - bq ports: - 8000:8000 @@ 
-136,6 +138,18 @@ services: - bmo-gcs-sitemapindex:/data/sitemapindex - bmo-gcs-mining:/data/mining + bigquery: + platform: linux/x86_64 + image: ghcr.io/goccy/bigquery-emulator:latest + ports: + - 9050:9050 + volumes: + - bmo-bigquery-data:/work + - ./docker/bigquery/data.yaml:/work/data.yaml + working_dir: /work + command: | + --project=test --data-from-yaml=/work/data.yaml --log-level=debug + externalapi.test: platform: linux/x86_64 build: *bmo_build @@ -143,6 +157,19 @@ services: ports: - 8001:8001 + bq: + platform: linux/x86_64 + build: + context: ./docker/bigquery + dockerfile: Dockerfile + ports: + - 9050:9050 + volumes: + - bmo-bq-data:/work + working_dir: /work + command: | + --project=test --data-from-yaml=/data.yaml --log-level=debug + volumes: bmo-mysql-db: bmo-data-dir: @@ -150,3 +177,4 @@ volumes: bmo-gcs-attachments: bmo-gcs-sitemapindex: bmo-gcs-mining: + bmo-bq-data: diff --git a/docker/bigquery/Dockerfile b/docker/bigquery/Dockerfile new file mode 100644 index 0000000000..7cb792483f --- /dev/null +++ b/docker/bigquery/Dockerfile @@ -0,0 +1,3 @@ +FROM ghcr.io/goccy/bigquery-emulator:0.6.5 + +COPY data.yaml /data.yaml diff --git a/docker/bigquery/data.yaml b/docker/bigquery/data.yaml new file mode 100644 index 0000000000..13b93aa469 --- /dev/null +++ b/docker/bigquery/data.yaml @@ -0,0 +1,185 @@ +projects: + - id: test + datasets: + - id: bugzilla + tables: + - id: bugs + columns: + - name: id + type: INTEGER + - name: assignee_id + type: INTEGER + - name: url + type: STRING + - name: severity + type: STRING + - name: status + type: STRING + - name: type + type: STRING + - name: crash_signature + type: STRING + - name: component + type: STRING + - name: creation_ts + type: TIMESTAMP + - name: updated_ts + type: TIMESTAMP + - name: op_sys + type: STRING + - name: priority + type: STRING + - name: product + type: STRING + - name: platform + type: STRING + - name: reporter_id + type: INTEGER + - name: resolution + type: STRING + - name: summary + type: STRING + - name: whiteboard + type: STRING + - name: milestone + type: STRING + - name: version + type: STRING + - name: team_name + type: STRING + - name: group + type: STRING + - name: classification + type: STRING + - name: is_public + type: BOOLEAN + - name: comment_count + type: INTEGER + - name: cc_count + type: INTEGER + - name: vote_count + type: INTEGER + - name: snapshot_date + type: DATE + - id: attachments + columns: + - name: id + type: INT64 + - name: bug_id + type: INT64 + - name: creation_ts + type: TIMESTAMP + - name: description + type: STRING + - name: filename + type: STRING + - name: is_obsolete + type: BOOL + - name: content_type + type: STRING + - name: updated_ts + type: TIMESTAMP + - name: submitter_id + type: INT64 + - name: snapshot_date + type: DATE + - id: flags + columns: + - name: attachment_id + type: INT64 + - name: bug_id + type: INT64 + - name: creation_ts + type: TIMESTAMP + - name: updated_ts + type: TIMESTAMP + - name: requestee_id + type: INT64 + - name: setter_id + type: INT64 + - name: name + type: STRING + - name: value + type: STRING + - name: snapshot_date + type: DATE + - id: tracking_flags + columns: + - name: bug_id + type: INT64 + - name: name + type: STRING + - name: value + type: STRING + - name: snapshot_date + type: DATE + - id: keywords + columns: + - name: bug_id + type: INT64 + - name: keyword + type: STRING + - name: snapshot_date + type: DATE + - id: see_also + columns: + - name: bug_id + type: INT64 + - name: url + type: STRING + - name: snapshot_date + type: DATE + - id: 
bug_mentors + columns: + - name: bug_id + type: INT64 + - name: user_id + type: INT64 + - name: snapshot_date + type: DATE + - id: bug_dependencies + columns: + - name: bug_id + type: INT64 + - name: depends_on_id + type: INT64 + - name: snapshot_date + type: DATE + - id: bug_regressions + columns: + - name: bug_id + type: INT64 + - name: regresses_id + type: INT64 + - name: snapshot_date + type: DATE + - id: bug_duplicates + columns: + - name: bug_id + type: INT64 + - name: duplicate_of_id + type: INT64 + - name: snapshot_date + type: DATE + - id: users + columns: + - name: id + type: INT64 + - name: last_seen + type: TIMESTAMP + - name: email + type: STRING + - name: nick + type: STRING + - name: name + type: STRING + - name: is_staff + type: BOOL + - name: is_trusted + type: BOOL + - name: ldap_email + type: STRING + - name: is_new + type: BOOL + - name: snapshot_date + type: DATE diff --git a/extensions/BMO/Extension.pm b/extensions/BMO/Extension.pm index b6bbcf7af5..9b3f7a2f68 100755 --- a/extensions/BMO/Extension.pm +++ b/extensions/BMO/Extension.pm @@ -1384,6 +1384,21 @@ sub db_schema_abstract_schema { ], INDEXES => [job_last_run_name_idx => {FIELDS => ['name'], TYPE => 'UNIQUE',},], }; + $args->{schema}->{bmo_etl_cache} = { + FIELDS => [ + id => {TYPE => 'INT3', NOTNULL => 1,}, + snapshot_date => {TYPE => 'DATETIME', NOTNULL => 1,}, + table_name => {TYPE => 'VARCHAR(100)', NOTNULL => 1,}, + data => {TYPE => 'LONGBLOB', NOTNULL => 1,}, + ], + INDEXES => + [bmo_etl_cache_idx => {FIELDS => ['id', 'snapshot_date', 'table_name']}], + }; + $args->{schema}->{bmo_etl_locked} = { + FIELDS => [ + value => {TYPE => 'VARCHAR(20)', NOTNULL => 1,}, + ], + }; } sub install_update_db { @@ -2588,6 +2603,33 @@ sub config_modify_panels { name => 'enable_triaged_keyword', type => 'b', }; + push @{$args->{panels}->{reports}->{params}}, + { + name => 'bmo_etl_enabled', + type => 'b', + default => 0, + }; + push @{$args->{panels}->{reports}->{params}}, + { + name => 'bmo_etl_base_url', + type => 't', + }; + push @{$args->{panels}->{reports}->{params}}, + { + name => 'bmo_etl_service_account', + type => 't', + }; + push @{$args->{panels}->{reports}->{params}}, + { + name => 'bmo_etl_project_id', + type => 't', + }; + push @{$args->{panels}->{reports}->{params}}, + { + name => 'bmo_etl_dataset_id', + type => 't', + }; + } sub comment_after_add_tag { diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl new file mode 100644 index 0000000000..6a2c6a93b3 --- /dev/null +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -0,0 +1,727 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This Source Code Form is "Incompatible With Secondary Licenses", as +# defined by the Mozilla Public License, v. 2.0. + +use 5.10.1; +use strict; +use warnings; +use lib qw(. lib local/lib/perl5); + +use Bugzilla; +use Bugzilla::Attachment; +use Bugzilla::Bug; +use Bugzilla::Constants; +use Bugzilla::Flag; +use Bugzilla::User; + +use HTTP::Headers; +use HTTP::Request; +use IO::Compress::Gzip qw(gzip $GzipError); +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); +use LWP::UserAgent::Determined; +use Mojo::File qw(path); +use Mojo::JSON qw(decode_json encode_json false true); +use Mojo::Util qw(getopt); + +# BigQuery API cannot handle payloads larger than 10MB so +# we will send data in blocks. 
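+# (For example, at the block size of 1000 rows used below, a bugs table of
+# 2.5 million rows is sent as 2,500 insertAll requests, keeping each payload
+# well under the limit for typical row sizes.)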
+use constant API_BLOCK_COUNT => 1000;
+
+Bugzilla->usage_mode(USAGE_MODE_CMDLINE);
+getopt 't|test' => \my $test, 'v|verbose' => \my $verbose, 's|snapshot-date=s' => \my $snapshot_date;
+
+# Sanity checks
+Bugzilla->params->{bmo_etl_enabled} || die "BMO ETL not enabled.\n";
+
+my $base_url = Bugzilla->params->{bmo_etl_base_url};
+$base_url || die "Invalid BigQuery base URL.\n";
+
+my $project_id = Bugzilla->params->{bmo_etl_project_id};
+$project_id || die "Invalid BigQuery project ID.\n";
+
+my $dataset_id = Bugzilla->params->{bmo_etl_dataset_id};
+$dataset_id || die "Invalid BigQuery dataset ID.\n";
+
+# Check to make sure another instance is not currently running
+check_and_set_lock();
+
+# Use replica if available
+my $dbh = Bugzilla->switch_to_shadow_db();
+
+my $ua = LWP::UserAgent::Determined->new(
+  agent                 => 'Bugzilla',
+  keep_alive            => 10,
+  requests_redirectable => [qw(GET HEAD DELETE PUT)],
+);
+$ua->timing('1,2,4,8,16,32');
+$ua->timeout(30);
+if (my $proxy = Bugzilla->params->{proxy_url}) {
+  $ua->proxy(['https', 'http'], $proxy);
+}
+
+# This date will be added to each object as it is being sent
+if (!$snapshot_date) {
+  $snapshot_date = $dbh->selectrow_array(
+    'SELECT ' . $dbh->sql_date_format('LOCALTIMESTAMP(0)', '%Y-%m-%d'));
+}
+
+# In order to avoid entering duplicate data, we will first query BigQuery
+# to make sure other entries with this date are not already present.
+check_for_duplicates();
+
+### Bugs
+
+my $table_name = 'bugs';
+my $count      = 0;
+my $last_id    = 0;
+
+my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM bugs');
+print "Processing $total $table_name.\n" if $verbose;
+
+my $sth
+  = $dbh->prepare(
+  'SELECT bug_id AS id, delta_ts AS modification_time FROM bugs WHERE bug_id > ? ORDER BY bug_id LIMIT '
+  . API_BLOCK_COUNT);
+
+while ($count < $total) {
+  my @bugs = ();
+
+  $sth->execute($last_id);
+
+  while (my ($id, $mod_time) = $sth->fetchrow_array()) {
+    print "Processing id $id with mod_time of $mod_time.\n" if $verbose;
+
+    # First check to see if we have a cached version with the same modification date
+    my $data = get_cache($id, $table_name, $mod_time);
+    if (!$data) {
+      print "$table_name id $id with time $mod_time not found in cache.\n"
+        if $verbose;
+
+      my $obj = Bugzilla::Bug->new($id);
+
+      $data = {
+        id              => $obj->id,
+        assignee_id     => $obj->assigned_to->id,
+        url             => $obj->bug_file_loc,
+        severity        => $obj->bug_severity,
+        status          => $obj->bug_status,
+        type            => $obj->bug_type,
+        crash_signature => $obj->cf_crash_signature,
+        component       => $obj->component,
+        creation_ts     => $obj->creation_ts,
+        updated_ts      => $obj->delta_ts,
+        op_sys          => $obj->op_sys,
+        priority        => $obj->priority,
+        product         => $obj->product,
+        platform        => $obj->rep_platform,
+        reporter_id     => $obj->reporter->id,
+        resolution      => $obj->resolution,
+        summary         => $obj->short_desc,
+        whiteboard      => $obj->status_whiteboard,
+        milestone       => $obj->target_milestone,
+        version         => $obj->version,
+        team_name       => $obj->component_obj->team_name,
+        classification  => $obj->classification,
+        comment_count   => $obj->comment_count,
+        vote_count      => $obj->votes,
+        group           => (join ',', map { $_->name } @{$obj->groups_in}),
+        is_public       => (!scalar @{$obj->groups_in} ? 
true : false), + cc_count => scalar @{$obj->cc || []}, + }; + + # Store a copy of the data for use in later executions + store_cache($obj->id, $table_name, $obj->delta_ts, $data); + } + + push @bugs, $data; + + $count++; + $last_id = $id; + } + + # Send the rows to the server + send_data($table_name, \@bugs, $count) if @bugs; +} + +### Attachments + +$table_name = 'attachments'; +$count = 0; +$last_id = 0; + +$total = $dbh->selectrow_array('SELECT COUNT(*) FROM attachments'); +print "Processing $total $table_name.\n" if $verbose; + +$sth + = $dbh->prepare( + 'SELECT attach_id AS id, modification_time FROM attachments WHERE attach_id > ? ORDER BY bug_id LIMIT ' + . API_BLOCK_COUNT); + +while ($count < $total) { + my @attachments = (); + + $sth->execute($last_id); + + while (my ($id, $mod_time) = $sth->fetchrow_array()) { + print "Processing id $id with mod_time of $mod_time.\n" if $verbose; + + # First check to see if we have a cached version with the same modification date + my $data = get_cache($id, $table_name, $mod_time); + if (!$data) { + print "$table_name id $id with time $mod_time not found in cache.\n" + if $verbose; + + my $obj = Bugzilla::Attachment->new($id); + + $data = { + id => $obj->id, + bug_id => $obj->bug_id, + creation_ts => $obj->attached, + description => $obj->description, + filename => $obj->filename, + content_type => $obj->contenttype, + updated_ts => $obj->modification_time, + submitter_id => $obj->attacher->id, + is_obsolete => ($obj->isobsolete ? true : false), + }; + + # Store a new copy of the data for use later + store_cache($obj->id, $table_name, $obj->modification_time, $data); + } + + push @attachments, $data; + + $count++; + $last_id = $id; + } + + # Send the rows to the server + send_data($table_name, \@attachments, $count) if @attachments; +} + +### Flags + +$table_name = 'flags'; +$count = 0; +$last_id = 0; + +$total = $dbh->selectrow_array('SELECT COUNT(*) FROM flags'); +print "Processing $total $table_name.\n" if $verbose; + +$sth + = $dbh->prepare( + 'SELECT id, modification_date FROM flags WHERE id > ? ORDER BY id LIMIT ' + . 
API_BLOCK_COUNT); + +while ($count < $total) { + my @flags = (); + + $sth->execute($last_id); + + while (my ($id, $mod_time) = $sth->fetchrow_array()) { + print "Processing id $id with mod_time of $mod_time.\n" if $verbose; + + # First check to see if we have a cached version with the same modification date + my $data = get_cache($id, $table_name, $mod_time); + if (!$data) { + print "$table_name id $id with time $mod_time not found in cache.\n" + if $verbose; + + my $obj = Bugzilla::Flag->new($id); + + $data = { + attachment_id => $obj->attach_id, + bug_id => $obj->bug_id, + creation_ts => $obj->creation_date, + updated_ts => $obj->modification_date, + requestee_id => $obj->requestee_id, + setter_id => $obj->setter_id, + name => $obj->type->name, + value => $obj->status, + }; + + # Store a new copy of the data for use later + store_cache($obj->id, $table_name, $obj->modification_date, $data); + } + + push @flags, $data; + + $count++; + $last_id = $id; + } + + # Send the rows to the server + send_data($table_name, \@flags, $count) if @flags; +} + +### Tracking Flags + +$table_name = 'tracking_flags'; +my $rows = $dbh->selectall_arrayref( + 'SELECT tracking_flags.name AS name, tracking_flags_bugs.bug_id AS bug_id, tracking_flags_bugs.value AS value + FROM tracking_flags_bugs + JOIN tracking_flags + ON tracking_flags_bugs.tracking_flag_id = tracking_flags.id + ORDER BY tracking_flags_bugs.bug_id', {Slice => {}} +); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +my @results = (); +foreach my $row (@{$rows}) { + my $data + = {bug_id => $row->{bug_id}, name => $row->{name}, value => $row->{value},}; + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### Keywords + +$table_name = 'keywords'; +$rows = $dbh->selectall_arrayref( + 'SELECT bug_id, keyworddefs.name AS name + FROM keywords + JOIN keyworddefs + ON keywords.keywordid = keyworddefs.id + ORDER BY bug_id', {Slice => {}} +); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +@results = (); +foreach my $row (@{$rows}) { + my $data = {bug_id => $row->{bug_id}, keyword => $row->{name},}; + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### See Also + +$table_name = 'see_also'; +$rows + = $dbh->selectall_arrayref( + 'SELECT bug_id, value, class FROM bug_see_also ORDER BY bug_id', + {Slice => {}}); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +@results = (); +foreach my $row (@{$rows}) { + my $data = {bug_id => $row->{bug_id},}; + if ($row->{class} =~ /::Local/) { + $data->{url} + = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . 
$row->{value}; + } + else { + $data->{url} = $row->{value}; + } + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### Mentors + +$table_name = 'bug_mentors'; +$rows + = $dbh->selectall_arrayref( + 'SELECT bug_id, user_id FROM bug_mentors ORDER BY bug_id', + {Slice => {}}); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +@results = (); +foreach my $row (@{$rows}) { + my $data = {bug_id => $row->{bug_id}, user_id => $row->{user_id}}; + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### Dependencies + +$table_name = 'bug_dependencies'; +$rows + = $dbh->selectall_arrayref( + 'SELECT blocked, dependson FROM dependencies ORDER BY blocked', + {Slice => {}}); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +@results = (); +foreach my $row (@{$rows}) { + my $data = {bug_id => $row->{blocked}, depends_on_id => $row->{dependson}}; + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### Regressions + +$table_name = 'bug_regressions'; +$rows + = $dbh->selectall_arrayref('SELECT regresses, regressed_by FROM regressions', + {Slice => {}}); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +@results = (); +foreach my $row (@{$rows}) { + my $data = {bug_id => $row->{regresses}, regresses_id => $row->{regressed_by},}; + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### Duplicates + +$table_name = 'bug_duplicates'; +$rows = $dbh->selectall_arrayref('SELECT dupe, dupe_of FROM duplicates', + {Slice => {}}); + +$total = scalar @{$rows}; +$count = 0; + +print "Processing $total $table_name.\n" if $verbose; + +@results = (); +foreach my $row (@{$rows}) { + my $data = {bug_id => $row->{dupe}, duplicate_of_id => $row->{dupe_of},}; + + push @results, $data; + + $count++; + + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } +} + +### Users + +$table_name = 'users'; +$count = 0; +$last_id = 0; + +$total = $dbh->selectrow_array('SELECT COUNT(*) FROM profiles'); +print "Processing $total $table_name.\n" if $verbose; + +$sth + = $dbh->prepare( + 'SELECT userid, modification_ts FROM profiles WHERE userid > ? ORDER BY userid LIMIT ' + . 
API_BLOCK_COUNT);
+
+while ($count < $total) {
+  my @users = ();
+
+  $sth->execute($last_id);
+
+  while (my ($id, $mod_time) = $sth->fetchrow_array()) {
+    print "Processing id $id with mod_time of $mod_time.\n" if $verbose;
+
+    # First check to see if we have a cached version with the same modification date
+    my $data = get_cache($id, $table_name, $mod_time);
+    if (!$data) {
+      print "$table_name id $id with time $mod_time not found in cache.\n"
+        if $verbose;
+
+      my $obj = Bugzilla::User->new($id);
+
+      $data = {
+        id         => $obj->id,
+        last_seen  => $obj->last_seen_date,
+        email      => $obj->email,
+        nick       => $obj->nick,
+        name       => $obj->name,
+        ldap_email => $obj->ldap_email,
+        is_new     => $obj->is_new,
+        is_staff   => ($obj->in_group('mozilla-employee-confidential') ? true : false),
+        is_trusted => ($obj->in_group('editbugs') ? true : false),
+      };
+
+      # Store a new copy of the data for use later
+      store_cache($obj->id, $table_name, $obj->modification_ts, $data);
+    }
+
+    push @users, $data;
+
+    $count++;
+    $last_id = $id;
+  }
+
+  # Send the rows to the server
+  send_data($table_name, \@users, $count) if @users;
+}
+
+# Delete lock from bmo_etl_locked
+Bugzilla->dbh_main->do('DELETE FROM bmo_etl_locked');
+
+# Functions
+
+sub get_cache {
+  my ($id, $table, $timestamp) = @_;
+
+  print "Retrieving data from $table for $id with time $timestamp.\n" if $verbose;
+
+  # Retrieve compressed JSON from cache table if it exists
+  my $gzipped_data = $dbh->selectrow_array(
+    'SELECT data FROM bmo_etl_cache WHERE id = ? AND table_name = ? AND snapshot_date = ?',
+    undef, $id, $table, $timestamp
+  );
+  return undef if !$gzipped_data;
+
+  # First uncompress the JSON and then decode it back to Perl data
+  my $data;
+  gunzip \$gzipped_data => \$data or die "gunzip failed: $GunzipError\n";
+  return decode_json($data);
+}
+
+sub store_cache {
+  my ($id, $table, $timestamp, $data) = @_;
+
+  print "Storing data into $table for $id with time $timestamp.\n" if $verbose;
+
+  # Encode the Perl data into JSON
+  $data = encode_json($data);
+
+  # Compress the JSON to save space in the DB
+  my $gzipped_data;
+  gzip \$data => \$gzipped_data or die "gzip failed: $GzipError\n";
+
+  # We need to use the main DB for write operations
+  my $main_dbh = Bugzilla->dbh_main;
+
+  # Clean out outdated JSON
+  $main_dbh->do('DELETE FROM bmo_etl_cache WHERE id = ? AND table_name = ?',
+    undef, $id, $table);
+
+  # Enter new cached JSON
+  $main_dbh->do(
+    'INSERT INTO bmo_etl_cache (id, table_name, snapshot_date, data) VALUES (?, ?, ?, ?)',
+    undef, $id, $table, $timestamp, $gzipped_data
+  );
+}
+
+sub send_data {
+  my ($table, $all_rows, $current_count) = @_;
+
+  print 'Sending '
+    . scalar @{$all_rows}
+    . " rows to table $table using BigQuery API\n"
+    if $verbose;
+
+  # Add the same snapshot date to every row sent
+  foreach my $row (@{$all_rows}) {
+    $row->{snapshot_date} = $snapshot_date;
+  }
+
+  my @json_rows = ();
+  foreach my $row (@{$all_rows}) {
+    push @json_rows, {json => $row};
+  }
+
+  my $big_query = {rows => \@json_rows};
+
+  if ($test) {
+    my $filename
+      = bz_locations()->{'datadir'} . '/'
+      . $snapshot_date . '-'
+      . $table . '-'
+      . $current_count . '.json';
+
+    print "Writing data to $filename\n" if $verbose;
+
+    my $fh = path($filename)->open('>>');
+    print $fh encode_json($big_query) . "\n";
"\n"; + close $fh || die "Could not close $filename: $!\n"; + + return; + } + + my $http_headers = HTTP::Headers->new; + + # Do not attempt to get access token if running in test environment + if ($base_url !~ /^http:\/\/[^\/]+:9050/) { + my $access_token = _get_access_token(); + $http_headers->header(Authorization => 'Bearer ' . $access_token); + } + + my $full_path = sprintf 'projects/%s/datasets/%s/tables/%s/insertAll', + $project_id, $dataset_id, $table; + + print "Sending to $base_url/$full_path\n" if $verbose; + + my $request = HTTP::Request->new('POST', "$base_url/$full_path", $http_headers); + $request->header('Content-Type' => 'application/json'); + $request->content(encode_json($big_query)); + + my $res = $ua->request($request); + if (!$res->is_success) { + die 'Google Big Query insert failure: ' . $res->content; + } +} + +sub _get_access_token { + state $access_token; # We should only need to get this once + state $token_expiry; + + # If we already have a token and it has not expired yet, just return it + if ($access_token && time < $token_expiry) { + return $access_token; + } + +# Google Kubernetes allows for the use of Workload Identity. This allows +# us to link two serice accounts together and give special access for applications +# running under Kubernetes. We use the special access to get an OAuth2 access_token +# that can then be used for accessing the the Google API such as BigQuery. + my $url + = sprintf + 'http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/%s/token', + Bugzilla->params->{bmo_etl_service_account}; + + my $http_headers = HTTP::Headers->new; + $http_headers->header('Metadata-Flavor' => 'Google'); + + my $request = HTTP::Request->new('GET', $url, $http_headers); + + my $res = $ua->request($request); + + if (!$res->is_success) { + die 'Google access token failure: ' . $res->content; + } + + my $result = decode_json($res->decoded_content); + $access_token = $result->{access_token}; + $token_expiry = time + $result->{expires_in}; + + return $access_token; +} + +sub check_and_set_lock { + return if $test; # No need if just dumping test files + + my $dbh_main = Bugzilla->dbh_main; + my $locked = $dbh_main->selectrow_array('SELECT COUNT(*) FROM bmo_etl_locked'); + if ($locked) { + die "Another process has set a lock. Exiting\n"; + } + $dbh_main->do('INSERT INTO bmo_etl_locked VALUES (?)', undef, 'locked'); +} + +sub check_for_duplicates { + return if $test; # no need if just dumping test files + + print "Checking for duplicate data for snapshot date $snapshot_date\n" + if $verbose; + + my $http_headers = HTTP::Headers->new; + + # Do not attempt to get access token if running in test environment + if ($base_url !~ /^http:\/\/[^\/]+:9050/) { + my $access_token = _get_access_token(); + $http_headers->header(Authorization => 'Bearer ' . $access_token); + } + + my $full_path = "projects/$project_id/queries"; + + print "Querying $base_url/$full_path\n" if $verbose; + + my $query + = {query => + "SELECT count(*) FROM ${project_id}.${dataset_id}.bugs WHERE snapshot_date = '$snapshot_date'" + }; + + my $request = HTTP::Request->new('POST', "$base_url/$full_path", $http_headers); + $request->header('Content-Type' => 'application/json'); + $request->content(encode_json($query)); + + my $res = $ua->request($request); + if (!$res->is_success) { + die 'Google Big Query query failure: ' . 
$res->content; + } + + my $result = decode_json($res->content); + + my $row_count = $result->{rows}->[0]->{f}->[0]->{v}; + + # Do not export if we have any rows with this snapshot date. + if ($row_count) { + die "Duplicate data found for snapshot date $snapshot_date\n"; + } +} + +1; diff --git a/extensions/BMO/t/bmo/bmo_etl.t b/extensions/BMO/t/bmo/bmo_etl.t new file mode 100644 index 0000000000..5101833688 --- /dev/null +++ b/extensions/BMO/t/bmo/bmo_etl.t @@ -0,0 +1,136 @@ +#!/usr/bin/env perl +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. +# +# This Source Code Form is "Incompatible With Secondary Licenses", as +# defined by the Mozilla Public License, v. 2.0. + +use 5.10.1; +use strict; +use warnings; +use lib qw(. lib local/lib/perl5 qa/t/lib); + +use Bugzilla; +use Bugzilla::Constants; + +BEGIN { + Bugzilla->extensions; +} + +use Capture::Tiny qw(capture); +use QA::Util qw(get_config); +use MIME::Base64 qw(encode_base64 decode_base64); +use Test::Mojo; +use Test::More; + +my $config = get_config(); +my $admin_login = $config->{admin_user_login}; +my $admin_api_key = $config->{admin_user_api_key}; +my $url = Bugzilla->localconfig->urlbase; + +my $t = Test::Mojo->new(); + +# Allow 1 redirect max +$t->ua->max_redirects(1); + +### Section 1: Create new bug + +my $new_bug_1 = { + product => 'Firefox', + component => 'General', + summary => 'This is a new test bug', + type => 'defect', + version => 'unspecified', + severity => 'blocker', + description => 'This is a new test bug', +}; + +$t->post_ok($url + . 'rest/bug' => {'X-Bugzilla-API-Key' => $admin_api_key} => json => + $new_bug_1)->status_is(200)->json_has('/id'); + +my $bug_id_1 = $t->tx->res->json->{id}; + +### Section 2: Create a new dependent bug + +my $new_bug_2 = { + product => 'Firefox', + component => 'General', + summary => 'This is a new dependent bug', + type => 'defect', + version => 'unspecified', + severity => 'blocker', + description => 'This is a new dependent bug', + depends_on => [$bug_id_1], +}; + +$t->post_ok($url + . 'rest/bug' => {'X-Bugzilla-API-Key' => $admin_api_key} => json => + $new_bug_2)->status_is(200)->json_has('/id'); + +my $bug_id_2 = $t->tx->res->json->{id}; + +### Section 3: Create an attachment + +my $attach_data = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'; + +my $new_attach_1 = { + is_patch => 1, + comment => 'This is a new attachment', + summary => 'Test Attachment', + content_type => 'text/plain', + data => encode_base64($attach_data), + file_name => 'test_attachment.patch', + obsoletes => [], + is_private => 0, +}; + +$t->post_ok($url + . "rest/bug/$bug_id_1/attachment" => + {'X-Bugzilla-API-Key' => $admin_api_key} => json => $new_attach_1)->status_is(201) + ->json_has('/attachments'); + +my ($attach_id) = keys %{$t->tx->res->json->{attachments}}; + +### Section 4: Export data to test files + +my @cmd + = ('perl', 'extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--test', '--snapshot-date', '2000-01-01'); + +my ($output, $error, $rv) = capture { system @cmd; }; +ok(!$rv, 'Data exported to test files without error'); +ok(glob(bz_locations()->{'datadir'} . 
'/2000-01-01-bugs-*.json'), 'Export test files exist'); + +### Section 5: Export data to BigQuery test instance + +@cmd = ('perl', 'extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); + +($output, $error, $rv) = capture { system @cmd; }; +ok(!$rv, 'Data exported to BigQuery test instance without error'); + +### Section 6: Retrieve data from BigQuery instance and verify + +my $query = {query => 'SELECT summary FROM test.bugzilla.bugs WHERE id = ' . $bug_id_1}; +$t->post_ok( + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => + $query)->status_is(200)->json_is('/rows/0/f/0/v' => $new_bug_1->{summary}); + +$query = {query => 'SELECT description FROM test.bugzilla.attachments WHERE id = ' . $attach_id}; +$t->post_ok( + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => + $query)->status_is(200)->json_is('/rows/0/f/0/v' => $new_attach_1->{summary}); + +$query = {query => 'SELECT depends_on_id FROM test.bugzilla.bug_dependencies WHERE bug_id = ' . $bug_id_2}; +$t->post_ok( + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => + $query)->status_is(200)->json_is('/rows/0/f/0/v' => $bug_id_1); + +### Section 7: Exporting again on the same day (with the same snapshot date) will cause the script to exit + +@cmd = ('perl', 'extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); + +($output, $error, $rv) = capture { system @cmd; }; +ok($rv, 'Duplicate data exported to BigQuery test instance should fail'); + +done_testing; diff --git a/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl b/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl index 5d3226ec5d..61c287700a 100644 --- a/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl +++ b/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl @@ -18,5 +18,12 @@ ELSIF panel.name == "bugfields"; ELSIF panel.name == "bugchange"; panel.param_descs.enable_triaged_keyword = 'Enforce usage of the "triaged" keyword on selected products.'; +ELSIF panel.name == "reports"; + panel.param_descs.bmo_etl_enabled = 'Enable export to BMO ETL.'; + panel.param_descs.bmo_etl_base_url = 'The base URL for sending BMO ETL data.'; + panel.param_descs.bmo_etl_service_account = 'THe Google service account for accessing the BMO ETL API.'; + panel.param_descs.bmo_etl_project_id = 'The project ID for the BMO ETL data.'; + panel.param_descs.bmo_etl_dataset_id = 'The dataset ID for the BMO ETL data.'; + END; %] From 7d944ee594542cd8496f2e6255390dafec5c5ee7 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Thu, 12 Dec 2024 12:25:19 -0500 Subject: [PATCH 02/12] Delete lock before dying --- extensions/BMO/bin/export_bmo_etl.pl | 35 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index 6a2c6a93b3..b1b156471a 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -527,8 +527,7 @@ send_data($table_name, \@users, $count) if @users; } -# Delete lock from bmo_etl_locked -Bugzilla->dbh_main->do('DELETE FROM bmo_etl_locked'); +delete_lock(); # Functions @@ -546,7 +545,10 @@ sub get_cache { # First uncompress the JSON and then decode it back to Perl data my $data; - gunzip \$gzipped_data => \$data or die "gunzip failed: $GunzipError\n"; + unless (gunzip \$gzipped_data => \$data) { + delete_lock(); + die 
"gunzip failed: $GunzipError\n"; + } return decode_json($data); } @@ -560,7 +562,10 @@ sub store_cache { # Compress the JSON to save space in the DB my $gzipped_data; - gzip \$data => \$gzipped_data or die "gzip failed: $GzipError\n"; + unless (gzip \$data => \$gzipped_data) { + delete_lock(); + die "gzip failed: $GzipError\n"; + } # We need to use the main DB for write operations my $main_dbh = Bugzilla->dbh_main; @@ -607,7 +612,10 @@ sub send_data { my $fh = path($filename)->open('>>'); print $fh encode_json($big_query) . "\n"; - close $fh || die "Could not close $filename: $!\n"; + unless(close $fh) { + delete_lock(); + die "Could not close $filename: $!\n"; + } return; } @@ -631,7 +639,8 @@ sub send_data { my $res = $ua->request($request); if (!$res->is_success) { - die 'Google Big Query insert failure: ' . $res->content; + delete_lock(); + die 'Google Big Query insert failure: ' . $res->content . "\n"; } } @@ -661,7 +670,8 @@ sub _get_access_token { my $res = $ua->request($request); if (!$res->is_success) { - die 'Google access token failure: ' . $res->content; + delete_lock(); + 'Google access token failure: ' . $res->content; } my $result = decode_json($res->decoded_content); @@ -671,6 +681,8 @@ sub _get_access_token { return $access_token; } +# If a previous process is performing an export to BigQuery, then +# we must check the lock table and exit if true. sub check_and_set_lock { return if $test; # No need if just dumping test files @@ -682,6 +694,11 @@ sub check_and_set_lock { $dbh_main->do('INSERT INTO bmo_etl_locked VALUES (?)', undef, 'locked'); } +# Delete lock from bmo_etl_locked +sub delete_lock { + Bugzilla->dbh_main->do('DELETE FROM bmo_etl_locked'); +} + sub check_for_duplicates { return if $test; # no need if just dumping test files @@ -711,7 +728,8 @@ sub check_for_duplicates { my $res = $ua->request($request); if (!$res->is_success) { - die 'Google Big Query query failure: ' . $res->content; + delete_lock(); + die 'Google Big Query query failure: ' . $res->content . "\n"; } my $result = decode_json($res->content); @@ -720,6 +738,7 @@ sub check_for_duplicates { # Do not export if we have any rows with this snapshot date. if ($row_count) { + delete_lock(); die "Duplicate data found for snapshot date $snapshot_date\n"; } } From ff4ec365dac379f3623a235548940133aba9bba2 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Thu, 12 Dec 2024 16:58:56 -0500 Subject: [PATCH 03/12] Added #! line for export script so webservices-infra can call it using ./extensions/BMO/bin/export_bmo_etl.pl --- extensions/BMO/bin/export_bmo_etl.pl | 1 + extensions/BMO/t/bmo/bmo_etl.t | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index b1b156471a..ee0768b40c 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -1,3 +1,4 @@ +#!/usr/bin/env perl # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
diff --git a/extensions/BMO/t/bmo/bmo_etl.t b/extensions/BMO/t/bmo/bmo_etl.t index 5101833688..ac2f7d1ba8 100644 --- a/extensions/BMO/t/bmo/bmo_etl.t +++ b/extensions/BMO/t/bmo/bmo_etl.t @@ -96,7 +96,7 @@ my ($attach_id) = keys %{$t->tx->res->json->{attachments}}; ### Section 4: Export data to test files my @cmd - = ('perl', 'extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--test', '--snapshot-date', '2000-01-01'); + = ('./extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--test', '--snapshot-date', '2000-01-01'); my ($output, $error, $rv) = capture { system @cmd; }; ok(!$rv, 'Data exported to test files without error'); @@ -104,7 +104,7 @@ ok(glob(bz_locations()->{'datadir'} . '/2000-01-01-bugs-*.json'), 'Export test f ### Section 5: Export data to BigQuery test instance -@cmd = ('perl', 'extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); +@cmd = ('./extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); ($output, $error, $rv) = capture { system @cmd; }; ok(!$rv, 'Data exported to BigQuery test instance without error'); @@ -128,7 +128,7 @@ $t->post_ok( ### Section 7: Exporting again on the same day (with the same snapshot date) will cause the script to exit -@cmd = ('perl', 'extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); +@cmd = ('./extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); ($output, $error, $rv) = capture { system @cmd; }; ok($rv, 'Duplicate data exported to BigQuery test instance should fail'); From 23fbf4265fd01b6d8fe538b5c26cc9d9bf1adbeb Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Mon, 16 Dec 2024 10:32:13 -0500 Subject: [PATCH 04/12] Extra cleanup and more verbose output --- extensions/BMO/bin/export_bmo_etl.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index ee0768b40c..7055b5581a 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -613,7 +613,7 @@ sub send_data { my $fh = path($filename)->open('>>'); print $fh encode_json($big_query) . "\n"; - unless(close $fh) { + unless (close $fh) { delete_lock(); die "Could not close $filename: $!\n"; } @@ -672,7 +672,7 @@ sub _get_access_token { if (!$res->is_success) { delete_lock(); - 'Google access token failure: ' . $res->content; + die 'Google access token failure: ' . $res->content . "\n"; } my $result = decode_json($res->decoded_content); @@ -697,6 +697,7 @@ sub check_and_set_lock { # Delete lock from bmo_etl_locked sub delete_lock { + print "Deleting lock in database\n" if $verbose; Bugzilla->dbh_main->do('DELETE FROM bmo_etl_locked'); } @@ -727,6 +728,8 @@ sub check_for_duplicates { $request->header('Content-Type' => 'application/json'); $request->content(encode_json($query)); + print encode_json($query) . 
"\n" if $verbose; + my $res = $ua->request($request); if (!$res->is_success) { delete_lock(); From d971acd06c70588ca06de740dedbe68548c0c286 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Wed, 18 Dec 2024 11:17:23 -0500 Subject: [PATCH 05/12] Remove extra bigquery service from docker-compose.yml --- docker-compose.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index fb9da1f7e0..f59f4e3786 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -138,18 +138,6 @@ services: - bmo-gcs-sitemapindex:/data/sitemapindex - bmo-gcs-mining:/data/mining - bigquery: - platform: linux/x86_64 - image: ghcr.io/goccy/bigquery-emulator:latest - ports: - - 9050:9050 - volumes: - - bmo-bigquery-data:/work - - ./docker/bigquery/data.yaml:/work/data.yaml - working_dir: /work - command: | - --project=test --data-from-yaml=/work/data.yaml --log-level=debug - externalapi.test: platform: linux/x86_64 build: *bmo_build From d2feaf8bdfbc4d8301c425d988e763de43114c1e Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 17 Jan 2025 17:53:50 -0500 Subject: [PATCH 06/12] Added some missing functionality and reorganized the code to use functions as suggested. --- docker-compose.yml | 1 - extensions/BMO/bin/export_bmo_etl.pl | 903 +++++++++++------- extensions/BMO/t/bmo/bmo_etl.t | 99 +- .../params/editparams-current_panel.html.tmpl | 2 +- 4 files changed, 633 insertions(+), 372 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index f59f4e3786..2ee79aab43 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -42,7 +42,6 @@ services: - memcached - s3 - gcs - - bigquery - externalapi.test - bq ports: diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index 7055b5581a..04ab89a62a 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -16,12 +16,14 @@ use Bugzilla::Bug; use Bugzilla::Constants; use Bugzilla::Flag; +use Bugzilla::Group; use Bugzilla::User; use HTTP::Headers; use HTTP::Request; use IO::Compress::Gzip qw(gzip $GzipError); use IO::Uncompress::Gunzip qw(gunzip $GunzipError); +use List::Util qw(any); use LWP::UserAgent::Determined; use Mojo::File qw(path); use Mojo::JSON qw(decode_json encode_json false true); @@ -31,8 +33,14 @@ # we will send data in blocks. use constant API_BLOCK_COUNT => 1000; +# Products which we should not send data to ETL such as Legal, etc. +use constant EXCLUDE_PRODUCTS => ('Legal',); + Bugzilla->usage_mode(USAGE_MODE_CMDLINE); -getopt 't|test' => \my $test, 'v|verbose' => \my $verbose, 's|snapshot-date=s' => \my $snapshot_date; +getopt + 't|test' => \my $test, + 'v|verbose' => \my $verbose, + 's|snapshot-date=s' => \my $snapshot_date; # Sanity checks Bugzilla->params->{bmo_etl_enabled} || die "BMO ETL not enabled.\n"; @@ -69,469 +77,635 @@ 'SELECT ' . $dbh->sql_date_format('LOCALTIMESTAMP(0)', '%Y-%m-%d')); } -# In order to avoid entering duplicate data, we will first query BigQuery +# Excluded bugs: List of bug ids that we should not send data for to ETL (i.e. Legal, etc.) +our %excluded_bugs = (); + +# Bugs that are private to one or more groups +our %private_bugs = (); + +# I order to avoid entering duplicate data, we will first query BigQuery # to make sure other entries with this date are not already present. 
+check_for_duplicates();
+
+# Process each table to be sent to ETL
+process_bugs();
+process_attachments();
+process_flags();
+process_flag_state_activity();
+process_tracking_flags();
+process_keywords();
+process_see_also();
+process_mentors();
+process_dependencies();
+process_regressions();
+process_duplicates();
+process_users();
+
+# If we are done, remove the lock
+delete_lock();
+
+### Functions
+
+sub process_bugs {
+  my $table_name = 'bugs';
+  my $count      = 0;
+  my $last_id    = 0;
+
+  my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM bugs');
+  print "Processing $total $table_name.\n" if $verbose;
+
+  my $sth
+    = $dbh->prepare(
+    'SELECT bug_id AS id, delta_ts AS modification_time FROM bugs WHERE bug_id > ? ORDER BY bug_id LIMIT '
+    . API_BLOCK_COUNT);
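+
+  # The loop below pages through bugs by keyset rather than OFFSET: each pass
+  # re-executes the prepared query with the highest bug id seen so far, e.g.
+  # $sth->execute(0) fetches the first API_BLOCK_COUNT bugs and
+  # $sth->execute($last_id) fetches the next block, until $count reaches $total.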
+
+  while ($count < $total) {
+    my @bugs = ();
+
+    $sth->execute($last_id);
+
+    while (my ($id, $mod_time) = $sth->fetchrow_array()) {
+      print "Processing id $id with mod_time of $mod_time.\n" if $verbose;
+
+      # First check to see if we have a cached version with the same modification date
+      my $data = get_cache($id, $table_name, $mod_time);
+      if (!$data) {
+        print "$table_name id $id with time $mod_time not found in cache.\n"
+          if $verbose;
+
+        my $obj = Bugzilla::Bug->new($id);
+
+        my $bug_is_private = scalar @{$obj->groups_in};
+
+        if (any { $obj->product eq $_ } EXCLUDE_PRODUCTS) {
+          $excluded_bugs{$obj->id} = 1;
+          next;
+        }
+
+        $private_bugs{$obj->id} = 1 if $bug_is_private;
+
+        # Standard non-sensitive fields
+        $data = {
+          id             => $obj->id,
+          status         => $obj->bug_status,
+          type           => $obj->bug_type,
+          component      => $obj->component,
+          creation_ts    => $obj->creation_ts,
+          updated_ts     => $obj->delta_ts,
+          op_sys         => $obj->op_sys,
+          product        => $obj->product,
+          platform       => $obj->rep_platform,
+          reporter_id    => $obj->reporter->id,
+          version        => $obj->version,
+          team_name      => $obj->component_obj->team_name,
+          classification => $obj->classification,
+          comment_count  => $obj->comment_count,
+          vote_count     => $obj->votes,
+        };
+
+        # Fields that require custom values based on criteria
+        $data->{assignee_id}
+          = $obj->assigned_to->login ne 'nobody@mozilla.org'
+          ? $obj->assigned_to->id
+          : undef;
+        $data->{url}
+          = (!$bug_is_private && $obj->bug_file_loc) ? $obj->bug_file_loc : undef;
+        $data->{severity} = $obj->bug_severity ne '--' ? $obj->bug_severity : undef;
+        $data->{crash_signature}
+          = (!$bug_is_private && $obj->cf_crash_signature)
+          ? $obj->cf_crash_signature
+          : undef;
+        $data->{priority}   = $obj->priority ne '--' ? $obj->priority : undef;
+        $data->{resolution} = $obj->resolution ? $obj->resolution : undef;
+        $data->{summary}    = !$bug_is_private ? $obj->short_desc : undef;
+        $data->{whiteboard}
+          = (!$bug_is_private && $obj->status_whiteboard)
+          ? $obj->status_whiteboard
+          : undef;
+        $data->{milestone}
+          = $obj->target_milestone ne '---' ? $obj->target_milestone : undef;
+        $data->{is_public} = $bug_is_private ? false : true;
+        $data->{cc_count}  = scalar @{$obj->cc || []};
+
+        # If the bug is in more than one group, pick the group with the fewest members
+        if (!$bug_is_private) {
+          $data->{group} = undef;
+        }
+        elsif (scalar @{$obj->groups_in} == 1) {
+          my $groups = $obj->groups_in;
+          $data->{group} = $groups->[0]->name;
+        }
+        else {
+          $data->{group} = get_multi_group_value($obj);
+        }
+
+        # Store a copy of the data for use in later executions
+        store_cache($obj->id, $table_name, $obj->delta_ts, $data);
+      }
+
+      push @bugs, $data;
+
+      $count++;
+      $last_id = $id;
+    }
+
+    # Send the rows to the server
+    send_data($table_name, \@bugs, $count) if @bugs;
+  }
+}
+
+sub process_attachments {
+  my $table_name = 'attachments';
+  my $count      = 0;
+  my $last_id    = 0;
+
+  my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM attachments');
+  print "Processing $total $table_name.\n" if $verbose;
+
+  my $sth
+    = $dbh->prepare(
+    'SELECT attach_id AS id, modification_time FROM attachments WHERE attach_id > ? ORDER BY attach_id LIMIT '
+    . API_BLOCK_COUNT);
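+
+  # get_cache()/store_cache() key cached rows on (id, table_name, modification
+  # timestamp), so an attachment is only rebuilt from the database when its
+  # modification_time has changed since the last export; otherwise the gzipped
+  # JSON already in bmo_etl_cache is reused as-is.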
+
+  while ($count < $total) {
+    my @attachments = ();
+
+    $sth->execute($last_id);
+
+    while (my ($id, $mod_time) = $sth->fetchrow_array()) {
+      print "Processing id $id with mod_time of $mod_time.\n" if $verbose;
+
+      # First check to see if we have a cached version with the same modification date
+      my $data = get_cache($id, $table_name, $mod_time);
+      if (!$data) {
+        print "$table_name id $id with time $mod_time not found in cache.\n"
+          if $verbose;
+
+        my $obj = Bugzilla::Attachment->new($id);
+
+        next if $excluded_bugs{$obj->bug_id};
+
+        # Standard non-sensitive fields
+        $data = {
+          id           => $obj->id,
+          bug_id       => $obj->bug_id,
+          creation_ts  => $obj->attached,
+          content_type => $obj->contenttype,
+          updated_ts   => $obj->modification_time,
+          submitter_id => $obj->attacher->id,
+          is_obsolete  => ($obj->isobsolete ? true : false),
+        };
+
+        # Fields that require custom values based on criteria
+        my $bug_is_private = exists $private_bugs{$obj->bug_id};
+        $data->{description} = !$bug_is_private ? $obj->description : undef;
+        $data->{filename}    = !$bug_is_private ? $obj->filename    : undef;
+
+        # Store a new copy of the data for use later
+        store_cache($obj->id, $table_name, $obj->modification_time, $data);
+      }
+
+      push @attachments, $data;
+
+      $count++;
+      $last_id = $id;
+    }
+
+    # Send the rows to the server
+    send_data($table_name, \@attachments, $count) if @attachments;
+  }
+}
+
+sub process_flags {
+  my $table_name = 'flags';
+  my $count      = 0;
+  my $last_id    = 0;
+
+  my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM flags');
+  print "Processing $total $table_name.\n" if $verbose;
+
+  my $sth
+    = $dbh->prepare(
+    'SELECT id, modification_date FROM flags WHERE id > ? ORDER BY id LIMIT '
+    . 
API_BLOCK_COUNT); + while ($count < $total) { + my @flags = (); -while ($count < $total) { - my @flags = (); + $sth->execute($last_id); - $sth->execute($last_id); + while (my ($id, $mod_time) = $sth->fetchrow_array()) { + print "Processing id $id with mod_time of $mod_time.\n" if $verbose; - while (my ($id, $mod_time) = $sth->fetchrow_array()) { - print "Processing id $id with mod_time of $mod_time.\n" if $verbose; + # First check to see if we have a cached version with the same modification date + my $data = get_cache($id, $table_name, $mod_time); + if (!$data) { + print "$table_name id $id with time $mod_time not found in cache.\n" + if $verbose; - # First check to see if we have a cached version with the same modification date - my $data = get_cache($id, $table_name, $mod_time); - if (!$data) { - print "$table_name id $id with time $mod_time not found in cache.\n" - if $verbose; + my $obj = Bugzilla::Flag->new($id); - my $obj = Bugzilla::Flag->new($id); + next if $excluded_bugs{$obj->bug_id}; - $data = { - attachment_id => $obj->attach_id, - bug_id => $obj->bug_id, - creation_ts => $obj->creation_date, - updated_ts => $obj->modification_date, - requestee_id => $obj->requestee_id, - setter_id => $obj->setter_id, - name => $obj->type->name, - value => $obj->status, - }; + $data = { + attachment_id => $obj->attach_id || undef, + bug_id => $obj->bug_id, + creation_ts => $obj->creation_date, + updated_ts => $obj->modification_date, + requestee_id => $obj->requestee_id, + setter_id => $obj->setter_id, + name => $obj->type->name, + value => $obj->status, + }; - # Store a new copy of the data for use later - store_cache($obj->id, $table_name, $obj->modification_date, $data); - } + # Store a new copy of the data for use later + store_cache($obj->id, $table_name, $obj->modification_date, $data); + } - push @flags, $data; + push @flags, $data; - $count++; - $last_id = $id; + $count++; + $last_id = $id; + } + + # Send the rows to the server + send_data($table_name, \@flags, $count) if @flags; } +} + +sub process_flag_state_activity { + # Process flags that were removed today using the flag_state_activity table + # These entries will also go into the flags table in BigQuery. + my $table_name = 'flag_state_activity'; + my $count = 0; + my $last_id = 0; + + my $total + = $dbh->selectrow_array( + 'SELECT COUNT(*) FROM flag_state_activity WHERE status = \'X\' AND flag_when LIKE \'' + . $snapshot_date . ' %\''); + print "Processing $total $table_name.\n" if $verbose; + + my $sth + = $dbh->prepare( + 'SELECT id, flag_when FROM flag_state_activity WHERE id > ? AND status = \'X\' AND flag_when LIKE \'' + . $snapshot_date . ' %\' ORDER BY id LIMIT ' + . 
API_BLOCK_COUNT); + + while ($count < $total) { + my @flags = (); + + $sth->execute($last_id); + + while (my ($id, $mod_time) = $sth->fetchrow_array()) { + print "Processing id $id with mod_time of $mod_time.\n" if $verbose; + + # First check to see if we have a cached version with the same modification date + my $data = get_cache($id, $table_name, $mod_time); + if (!$data) { + print "$table_name id $id with time $mod_time not found in cache.\n" + if $verbose; + + my $obj = Bugzilla::Extension::Review::FlagStateActivity->new($id); + + next if $excluded_bugs{$obj->bug_id}; + + $data = { + attachment_id => $obj->attachment_id || undef, + bug_id => $obj->bug_id, + creation_ts => $obj->flag_when, + updated_ts => $obj->flag_when, + requestee_id => $obj->requestee_id, + setter_id => $obj->setter_id, + name => $obj->type->name, + value => $obj->status, + }; + + # Store a new copy of the data for use later + store_cache($obj->id, $table_name, $obj->modification_date, $data); + } + + push @flags, $data; + + $count++; + $last_id = $id; + } - # Send the rows to the server - send_data($table_name, \@flags, $count) if @flags; + # Send the rows to the server + send_data('flags', \@flags, $count) if @flags; + } } -### Tracking Flags +sub process_tracking_flags { + my $table_name = 'tracking_flags'; + my $rows = $dbh->selectall_arrayref( + 'SELECT tracking_flags.name AS name, tracking_flags_bugs.bug_id AS bug_id, tracking_flags_bugs.value AS value + FROM tracking_flags_bugs + JOIN tracking_flags + ON tracking_flags_bugs.tracking_flag_id = tracking_flags.id + ORDER BY tracking_flags_bugs.bug_id', {Slice => {}} + ); -$table_name = 'tracking_flags'; -my $rows = $dbh->selectall_arrayref( - 'SELECT tracking_flags.name AS name, tracking_flags_bugs.bug_id AS bug_id, tracking_flags_bugs.value AS value - FROM tracking_flags_bugs - JOIN tracking_flags - ON tracking_flags_bugs.tracking_flag_id = tracking_flags.id - ORDER BY tracking_flags_bugs.bug_id', {Slice => {}} -); + my $total = scalar @{$rows}; + my $count = 0; -$total = scalar @{$rows}; -$count = 0; + print "Processing $total $table_name.\n" if $verbose; -print "Processing $total $table_name.\n" if $verbose; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{bug_id}}; -my @results = (); -foreach my $row (@{$rows}) { - my $data - = {bug_id => $row->{bug_id}, name => $row->{name}, value => $row->{value},}; + # Standard fields + my $data = {bug_id => $row->{bug_id}}; - push @results, $data; + # Fields that require custom values based on other criteria + if (exists $private_bugs{$row->{bug_id}}) { + $data->{name} = undef; + $data->{value} = undef; + } + else { + $data->{name} = $row->{name}; + $data->{value} = $row->{value}; + } - $count++; + push @results, $data; + + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### Keywords +sub proccess_keywords { + my $table_name = 'keywords'; + my $rows = $dbh->selectall_arrayref( + 'SELECT bug_id, keyworddefs.name AS name + FROM keywords + JOIN keyworddefs + ON keywords.keywordid = keyworddefs.id + ORDER BY bug_id', {Slice => {}} + ); + + my $total = scalar @{$rows}; + my $count = 
0; -$table_name = 'keywords'; -$rows = $dbh->selectall_arrayref( - 'SELECT bug_id, keyworddefs.name AS name - FROM keywords - JOIN keyworddefs - ON keywords.keywordid = keyworddefs.id - ORDER BY bug_id', {Slice => {}} -); + print "Processing $total $table_name.\n" if $verbose; -$total = scalar @{$rows}; -$count = 0; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{bug_id}}; -print "Processing $total $table_name.\n" if $verbose; + # Standard fields + my $data = {bug_id => $row->{bug_id}}; -@results = (); -foreach my $row (@{$rows}) { - my $data = {bug_id => $row->{bug_id}, keyword => $row->{name},}; + # Fields that require custom values based on other criteria + $data->{keyword} = !exists $private_bugs{$row->{bug_id}} ? $row->{name} : undef; - push @results, $data; + push @results, $data; - $count++; + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### See Also +sub process_see_also { + my $table_name = 'see_also'; + my $rows + = $dbh->selectall_arrayref( + 'SELECT bug_id, value, class FROM bug_see_also ORDER BY bug_id', + {Slice => {}}); -$table_name = 'see_also'; -$rows - = $dbh->selectall_arrayref( - 'SELECT bug_id, value, class FROM bug_see_also ORDER BY bug_id', - {Slice => {}}); + my $total = scalar @{$rows}; + my $count = 0; -$total = scalar @{$rows}; -$count = 0; + print "Processing $total $table_name.\n" if $verbose; -print "Processing $total $table_name.\n" if $verbose; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{bug_id}}; -@results = (); -foreach my $row (@{$rows}) { - my $data = {bug_id => $row->{bug_id},}; - if ($row->{class} =~ /::Local/) { - $data->{url} - = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . $row->{value}; - } - else { - $data->{url} = $row->{value}; - } + # Standard fields + my $data = {bug_id => $row->{bug_id},}; - push @results, $data; + # Fields that require custom values based on other criteria + if ($private_bugs{$row->{bug_id}}) { + $data->{url} = undef; + } + else { + if ($row->{class} =~ /::Local/) { + $data->{url} + = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . 
$row->{value}; + } + else { + $data->{url} = $row->{value}; + } + } - $count++; + push @results, $data; + + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### Mentors +sub process_mentors { + my $table_name = 'bug_mentors'; + my $rows + = $dbh->selectall_arrayref( + 'SELECT bug_id, user_id FROM bug_mentors ORDER BY bug_id', + {Slice => {}}); -$table_name = 'bug_mentors'; -$rows - = $dbh->selectall_arrayref( - 'SELECT bug_id, user_id FROM bug_mentors ORDER BY bug_id', - {Slice => {}}); + my $total = scalar @{$rows}; + my $count = 0; -$total = scalar @{$rows}; -$count = 0; + print "Processing $total $table_name.\n" if $verbose; -print "Processing $total $table_name.\n" if $verbose; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{bug_id}}; -@results = (); -foreach my $row (@{$rows}) { - my $data = {bug_id => $row->{bug_id}, user_id => $row->{user_id}}; + my $data = {bug_id => $row->{bug_id}, user_id => $row->{user_id}}; - push @results, $data; + push @results, $data; - $count++; + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### Dependencies +sub process_dependencies { + my $table_name = 'bug_dependencies'; + my $rows + = $dbh->selectall_arrayref( + 'SELECT blocked, dependson FROM dependencies ORDER BY blocked', + {Slice => {}}); -$table_name = 'bug_dependencies'; -$rows - = $dbh->selectall_arrayref( - 'SELECT blocked, dependson FROM dependencies ORDER BY blocked', - {Slice => {}}); + my $total = scalar @{$rows}; + my $count = 0; -$total = scalar @{$rows}; -$count = 0; + print "Processing $total $table_name.\n" if $verbose; -print "Processing $total $table_name.\n" if $verbose; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{blocked}}; -@results = (); -foreach my $row (@{$rows}) { - my $data = {bug_id => $row->{blocked}, depends_on_id => $row->{dependson}}; + my $data = {bug_id => $row->{blocked}, depends_on_id => $row->{dependson}}; - push @results, $data; + push @results, $data; - $count++; + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### Regressions +sub process_regressions { + my $table_name = 'bug_regressions'; + my $rows + = $dbh->selectall_arrayref('SELECT regresses, regressed_by FROM regressions', + {Slice => {}}); -$table_name = 'bug_regressions'; -$rows - = 
$dbh->selectall_arrayref('SELECT regresses, regressed_by FROM regressions', - {Slice => {}}); + my $total = scalar @{$rows}; + my $count = 0; -$total = scalar @{$rows}; -$count = 0; + print "Processing $total $table_name.\n" if $verbose; -print "Processing $total $table_name.\n" if $verbose; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{regresses}}; -@results = (); -foreach my $row (@{$rows}) { - my $data = {bug_id => $row->{regresses}, regresses_id => $row->{regressed_by},}; + my $data = {bug_id => $row->{regresses}, regresses_id => $row->{regressed_by},}; - push @results, $data; + push @results, $data; - $count++; + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### Duplicates +sub process_duplicates { + my $table_name = 'bug_duplicates'; + my $rows = $dbh->selectall_arrayref('SELECT dupe, dupe_of FROM duplicates', + {Slice => {}}); -$table_name = 'bug_duplicates'; -$rows = $dbh->selectall_arrayref('SELECT dupe, dupe_of FROM duplicates', - {Slice => {}}); + my $total = scalar @{$rows}; + my $count = 0; -$total = scalar @{$rows}; -$count = 0; + print "Processing $total $table_name.\n" if $verbose; -print "Processing $total $table_name.\n" if $verbose; + my @results = (); + foreach my $row (@{$rows}) { + next if $excluded_bugs{$row->{dupe}}; -@results = (); -foreach my $row (@{$rows}) { - my $data = {bug_id => $row->{dupe}, duplicate_of_id => $row->{dupe_of},}; + my $data = {bug_id => $row->{dupe}, duplicate_of_id => $row->{dupe_of},}; - push @results, $data; + push @results, $data; - $count++; + $count++; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + # Send the rows to the server if we have a specific sized block' + # or we are at the last row + if (scalar @results == API_BLOCK_COUNT || $total == $count) { + send_data($table_name, \@results, $count); + @results = (); + } } } -### Users +sub process_users { + my $table_name = 'users'; + my $count = 0; + my $last_id = 0; -$table_name = 'users'; -$count = 0; -$last_id = 0; + my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM profiles'); + print "Processing $total $table_name.\n" if $verbose; -$total = $dbh->selectrow_array('SELECT COUNT(*) FROM profiles'); -print "Processing $total $table_name.\n" if $verbose; + my $sth + = $dbh->prepare( + 'SELECT userid, modification_ts FROM profiles WHERE userid > ? ORDER BY userid LIMIT ' + . API_BLOCK_COUNT); -$sth - = $dbh->prepare( - 'SELECT userid, modification_ts FROM profiles WHERE userid > ? ORDER BY userid LIMIT ' - . 
API_BLOCK_COUNT); + while ($count < $total) { + my @users = (); -while ($count < $total) { - my @users = (); + $sth->execute($last_id); - $sth->execute($last_id); + while (my ($id, $mod_time) = $sth->fetchrow_array()) { + print "Processing id $id with mod_time of $mod_time.\n" if $verbose; - while (my ($id, $mod_time) = $sth->fetchrow_array()) { - print "Processing id $id with mod_time of $mod_time.\n" if $verbose; + # First check to see if we have a cached version with the same modification date + my $data = get_cache($id, $table_name, $mod_time); + if (!$data) { + print "$table_name id $id with time $mod_time not found in cache.\n" + if $verbose; - # First check to see if we have a cached version with the same modification date - my $data = get_cache($id, $table_name, $mod_time); - if (!$data) { - print "$table_name id $id with time $mod_time not found in cache.\n" - if $verbose; + my $obj = Bugzilla::User->new($id); - my $obj = Bugzilla::User->new($id); + # Standard fields + $data = { + id => $obj->id, + last_seen => $obj->last_seen_date, + email => $obj->email, + is_new => $obj->is_new, + }; - $data = { - id => $obj->id, - last_seen => $obj->last_seen_date, - email => $obj->email, - nick => $obj->nick, - name => $obj->name, - ldap_email => $obj->ldap_email, - is_new => $obj->is_new, - is_staff => ($obj->in_group('mozilla-employee-confidential') ? true : false), - is_trusted => ($obj->in_group('editbugs') ? true : false), - }; + # Fields that require custom values based on criteria + $data->{nick} = $obj->nick ? $obj->nick : undef; + $data->{name} = $obj->name ? $obj->name : undef; + $data->{is_staff} + = $obj->in_group('mozilla-employee-confidential') ? true : false; + $data->{is_trusted} = $obj->in_group('editbugs') ? true : false; + $data->{ldap_email} = $obj->ldap_email ? $obj->ldap_email : undef; - # Store a new copy of the data for use later - store_cache($obj->id, $table_name, $obj->modification_ts, $data); - } + # Store a new copy of the data for use later + store_cache($obj->id, $table_name, $obj->modification_ts, $data); + } - push @users, $data; + push @users, $data; - $count++; - $last_id = $id; - } + $count++; + $last_id = $id; + } - # Send the rows to the server - send_data($table_name, \@users, $count) if @users; + # Send the rows to the server + send_data($table_name, \@users, $count) if @users; + } } -delete_lock(); - -# Functions - sub get_cache { my ($id, $table, $timestamp) = @_; @@ -638,10 +812,17 @@ sub send_data { $request->header('Content-Type' => 'application/json'); $request->content(encode_json($big_query)); - my $res = $ua->request($request); - if (!$res->is_success) { + my $response = $ua->request($request); + my $result = decode_json($response->content); + + if (!$response->is_success + || (exists $result->{insertErrors} && @{$result->{insertErrors}})) + { delete_lock(); - die 'Google Big Query insert failure: ' . $res->content . "\n"; + die "Google Big Query insert failure:\nRequest:\n" + . $request->content + . "\n\nResponse:\n" + . $response->content . "\n"; } } @@ -655,7 +836,7 @@ sub _get_access_token { } # Google Kubernetes allows for the use of Workload Identity. This allows -# us to link two serice accounts together and give special access for applications +# us to link two service accounts together and give special access for applications # running under Kubernetes. We use the special access to get an OAuth2 access_token # that can then be used for accessing the the Google API such as BigQuery. 
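 #
 # For illustration only (not code from this patch): the conventional
 # Workload Identity exchange is a single GET against the GKE metadata
 # server; the endpoint and response shape below follow the standard GCP
 # metadata conventions, and decode_json is the import this script already
 # uses elsewhere:
 #
 #   my $req = HTTP::Request->new(GET =>
 #     'http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token');
 #   $req->header('Metadata-Flavor' => 'Google');    # required by the metadata server
 #   my $res = LWP::UserAgent->new->request($req);
 #   die 'Token fetch failed: ' . $res->status_line if !$res->is_success;
 #   my $access_token = decode_json($res->content)->{access_token};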
my $url @@ -685,7 +866,7 @@ sub _get_access_token { # If a previous process is performing an export to BigQuery, then # we must check the lock table and exit if true. sub check_and_set_lock { - return if $test; # No need if just dumping test files + return if $test; # No need if just dumping test files my $dbh_main = Bugzilla->dbh_main; my $locked = $dbh_main->selectrow_array('SELECT COUNT(*) FROM bmo_etl_locked'); @@ -702,7 +883,7 @@ sub delete_lock { } sub check_for_duplicates { - return if $test; # no need if just dumping test files + return if $test; # no need if just dumping test files print "Checking for duplicate data for snapshot date $snapshot_date\n" if $verbose; @@ -719,10 +900,11 @@ sub check_for_duplicates { print "Querying $base_url/$full_path\n" if $verbose; - my $query - = {query => - "SELECT count(*) FROM ${project_id}.${dataset_id}.bugs WHERE snapshot_date = '$snapshot_date'" - }; + my $query = { + query => + "SELECT count(*) FROM ${project_id}.${dataset_id}.bugs WHERE snapshot_date = '$snapshot_date';", + useLegacySql => false, + }; my $request = HTTP::Request->new('POST', "$base_url/$full_path", $http_headers); $request->header('Content-Type' => 'application/json'); @@ -747,4 +929,25 @@ sub check_for_duplicates { } } +sub get_multi_group_value { + my ($bug) = @_; + + my $smallest_group_name = undef; + my $smallest_group_count = 0; + + foreach my $group (@{$bug->groups_in}) { + my $user_count = 0; + my $member_data = $group->members_complete; + foreach my $type (keys %{$member_data}) { + $user_count += scalar @{$member_data->{$type}}; + } + if ($user_count < $smallest_group_count) { + $smallest_group_count = $user_count; + $smallest_group_name = $group->name; + } + } + + return $smallest_group_name; +} + 1; diff --git a/extensions/BMO/t/bmo/bmo_etl.t b/extensions/BMO/t/bmo/bmo_etl.t index ac2f7d1ba8..fc040db9d7 100644 --- a/extensions/BMO/t/bmo/bmo_etl.t +++ b/extensions/BMO/t/bmo/bmo_etl.t @@ -21,13 +21,15 @@ BEGIN { use Capture::Tiny qw(capture); use QA::Util qw(get_config); use MIME::Base64 qw(encode_base64 decode_base64); +use Mojo::JSON qw(false); use Test::Mojo; use Test::More; -my $config = get_config(); -my $admin_login = $config->{admin_user_login}; -my $admin_api_key = $config->{admin_user_api_key}; -my $url = Bugzilla->localconfig->urlbase; +my $config = get_config(); +my $admin_login = $config->{admin_user_login}; +my $admin_api_key = $config->{admin_user_api_key}; +my $editbugs_api_key = $config->{editbugs_user_api_key}; +my $url = Bugzilla->localconfig->urlbase; my $t = Test::Mojo->new(); @@ -44,6 +46,11 @@ my $new_bug_1 = { version => 'unspecified', severity => 'blocker', description => 'This is a new test bug', + flags => [{ + name => 'needinfo', + status => '?', + requestee => $config->{editbugs_user_login}, + }], }; $t->post_ok($url @@ -52,6 +59,11 @@ $t->post_ok($url my $bug_id_1 = $t->tx->res->json->{id}; +# Clear the needinfo which will add an X entry in flag_state_activity +$t->put_ok($url + . "rest/bug/$bug_id_1" => {'X-Bugzilla-API-Key' => $editbugs_api_key} => + json => {comment => {body => 'This is my comment'}})->status_is(200); + ### Section 2: Create a new dependent bug my $new_bug_2 = { @@ -84,51 +96,98 @@ my $new_attach_1 = { file_name => 'test_attachment.patch', obsoletes => [], is_private => 0, + flags => [{ + name => 'review', + status => '?', + requestee => $config->{editbugs_user_login}, + }], }; $t->post_ok($url . 
"rest/bug/$bug_id_1/attachment" => - {'X-Bugzilla-API-Key' => $admin_api_key} => json => $new_attach_1)->status_is(201) - ->json_has('/attachments'); + {'X-Bugzilla-API-Key' => $admin_api_key} => json => $new_attach_1) + ->status_is(201)->json_has('/attachments'); my ($attach_id) = keys %{$t->tx->res->json->{attachments}}; ### Section 4: Export data to test files -my @cmd - = ('./extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--test', '--snapshot-date', '2000-01-01'); +my @cmd = ( + './extensions/BMO/bin/export_bmo_etl.pl', + '--verbose', '--test', '--snapshot-date', '2000-01-01' +); my ($output, $error, $rv) = capture { system @cmd; }; ok(!$rv, 'Data exported to test files without error'); -ok(glob(bz_locations()->{'datadir'} . '/2000-01-01-bugs-*.json'), 'Export test files exist'); +ok(glob(bz_locations()->{'datadir'} . '/2000-01-01-bugs-*.json'), + 'Export test files exist'); ### Section 5: Export data to BigQuery test instance -@cmd = ('./extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); +@cmd = ( + './extensions/BMO/bin/export_bmo_etl.pl', '--verbose', + '--snapshot-date', '2000-01-01' +); ($output, $error, $rv) = capture { system @cmd; }; ok(!$rv, 'Data exported to BigQuery test instance without error'); ### Section 6: Retrieve data from BigQuery instance and verify -my $query = {query => 'SELECT summary FROM test.bugzilla.bugs WHERE id = ' . $bug_id_1}; +my $query = { + query => 'SELECT summary FROM test.bugzilla.bugs WHERE id = ' . $bug_id_1 . ';', + useLegacySql => false +}; $t->post_ok( - 'http://bq:9050/bigquery/v2/projects/test/queries' => json => - $query)->status_is(200)->json_is('/rows/0/f/0/v' => $new_bug_1->{summary}); + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => $query) + ->status_is(200)->json_is('/rows/0/f/0/v' => $new_bug_1->{summary}); -$query = {query => 'SELECT description FROM test.bugzilla.attachments WHERE id = ' . $attach_id}; +$query = { + query => 'SELECT description FROM test.bugzilla.attachments WHERE id = ' + . $attach_id . ';', + useLegacySql => false +}; +$t->post_ok( + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => $query) + ->status_is(200)->json_is('/rows/0/f/0/v' => $new_attach_1->{summary}); + +$query = { + query => + 'SELECT depends_on_id FROM test.bugzilla.bug_dependencies WHERE bug_id = ' + . $bug_id_2 . ';', + useLegacySql => false +}; $t->post_ok( - 'http://bq:9050/bigquery/v2/projects/test/queries' => json => - $query)->status_is(200)->json_is('/rows/0/f/0/v' => $new_attach_1->{summary}); + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => $query) + ->status_is(200)->json_is('/rows/0/f/0/v' => $bug_id_1); + +$query = { + query => 'SELECT bug_id FROM test.bugzilla.flags WHERE bug_id = ' + . $bug_id_1 . " AND name = 'needinfo';", + useLegacySql => false +}; +$t->post_ok( + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => $query) + ->status_is(200)->json_is('/rows/0/f/0/v' => $bug_id_1); + +$query = { + query => 'SELECT bug_id FROM test.bugzilla.flags WHERE bug_id = ' + . $bug_id_1 + . " AND name = 'review' AND attachment_id = " + . $attach_id . ';', + useLegacySql => false +}; -$query = {query => 'SELECT depends_on_id FROM test.bugzilla.bug_dependencies WHERE bug_id = ' . 
$bug_id_2}; $t->post_ok( - 'http://bq:9050/bigquery/v2/projects/test/queries' => json => - $query)->status_is(200)->json_is('/rows/0/f/0/v' => $bug_id_1); + 'http://bq:9050/bigquery/v2/projects/test/queries' => json => $query) + ->status_is(200)->json_is('/rows/0/f/0/v' => $bug_id_1); ### Section 7: Exporting again on the same day (with the same snapshot date) will cause the script to exit -@cmd = ('./extensions/BMO/bin/export_bmo_etl.pl', '--verbose', '--snapshot-date', '2000-01-01'); +@cmd = ( + './extensions/BMO/bin/export_bmo_etl.pl', '--verbose', + '--snapshot-date', '2000-01-01', +); ($output, $error, $rv) = capture { system @cmd; }; ok($rv, 'Duplicate data exported to BigQuery test instance should fail'); diff --git a/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl b/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl index 61c287700a..1ce5374bc3 100644 --- a/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl +++ b/extensions/BMO/template/en/default/hook/admin/params/editparams-current_panel.html.tmpl @@ -21,7 +21,7 @@ ELSIF panel.name == "bugchange"; ELSIF panel.name == "reports"; panel.param_descs.bmo_etl_enabled = 'Enable export to BMO ETL.'; panel.param_descs.bmo_etl_base_url = 'The base URL for sending BMO ETL data.'; - panel.param_descs.bmo_etl_service_account = 'THe Google service account for accessing the BMO ETL API.'; + panel.param_descs.bmo_etl_service_account = 'The Google service account for accessing the BMO ETL API.'; panel.param_descs.bmo_etl_project_id = 'The project ID for the BMO ETL data.'; panel.param_descs.bmo_etl_dataset_id = 'The dataset ID for the BMO ETL data.'; From da91d434a5d2b76aaef6843f983b217133168c85 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 17 Jan 2025 22:35:18 -0500 Subject: [PATCH 07/12] Fixed failing sanity test --- extensions/BMO/bin/export_bmo_etl.pl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index 04ab89a62a..e2334d1454 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -188,16 +188,16 @@ sub process_bugs { $data->{cc_count} = scalar @{$obj->cc || []}; # If more than one group, then pick the one with the least of amount of members - if (!$bug_is_private) { + if (!$bug_is_private) { $data->{group} = undef; - } - elsif (scalar @{$obj->groups_in} == 1) { - my $groups = $obj->groups_in; - $data->{group} = $groups->[0]->name; - } - else { - $data->{group} = get_multi_group_value($obj); - } + } + elsif (scalar @{$obj->groups_in} == 1) { + my $groups = $obj->groups_in; + $data->{group} = $groups->[0]->name; + } + else { + $data->{group} = get_multi_group_value($obj); + } # Store a copy of the data for use in later executions store_cache($obj->id, $table_name, $obj->delta_ts, $data); From 5ba44bdea6d75603254414edb46a4bb50a2c20ff Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Wed, 22 Jan 2025 16:05:51 -0500 Subject: [PATCH 08/12] - Updated test script to use todays date instead of hardcoded date - Added test to make sure we also add flag entries for deletions - Fixed some typos in export script --- extensions/BMO/bin/export_bmo_etl.pl | 7 ++-- extensions/BMO/t/bmo/bmo_etl.t | 49 +++++++++++++++++++--------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index 
e2334d1454..0e0807f6b5 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -18,6 +18,7 @@ use Bugzilla::Flag; use Bugzilla::Group; use Bugzilla::User; +use Bugzilla::Extension::Review::FlagStateActivity; use HTTP::Headers; use HTTP::Request; @@ -93,7 +94,7 @@ process_flags(); process_flag_state_activity(); process_tracking_flags(); -proccess_keywords(); +process_keywords(); process_see_also(); process_mentors(); process_dependencies(); @@ -382,7 +383,7 @@ sub process_flag_state_activity { }; # Store a new copy of the data for use later - store_cache($obj->id, $table_name, $obj->modification_date, $data); + store_cache($obj->id, $table_name, $obj->flag_when, $data); } push @flags, $data; @@ -441,7 +442,7 @@ sub process_tracking_flags { } } -sub proccess_keywords { +sub process_keywords { my $table_name = 'keywords'; my $rows = $dbh->selectall_arrayref( 'SELECT bug_id, keyworddefs.name AS name diff --git a/extensions/BMO/t/bmo/bmo_etl.t b/extensions/BMO/t/bmo/bmo_etl.t index fc040db9d7..e414b09cdf 100644 --- a/extensions/BMO/t/bmo/bmo_etl.t +++ b/extensions/BMO/t/bmo/bmo_etl.t @@ -19,9 +19,10 @@ BEGIN { } use Capture::Tiny qw(capture); -use QA::Util qw(get_config); -use MIME::Base64 qw(encode_base64 decode_base64); -use Mojo::JSON qw(false); +use DateTime; +use QA::Util qw(get_config); +use MIME::Base64 qw(encode_base64 decode_base64); +use Mojo::JSON qw(false); use Test::Mojo; use Test::More; @@ -30,6 +31,7 @@ my $admin_login = $config->{admin_user_login}; my $admin_api_key = $config->{admin_user_api_key}; my $editbugs_api_key = $config->{editbugs_user_api_key}; my $url = Bugzilla->localconfig->urlbase; +my $snapshot_date = DateTime->now()->strftime('%Y-%m-%d'); my $t = Test::Mojo->new(); @@ -60,9 +62,13 @@ $t->post_ok($url my $bug_id_1 = $t->tx->res->json->{id}; # Clear the needinfo which will add an X entry in flag_state_activity +my $needinfo_update = { + comment => {body => 'This is my comment'}, + flags => [{name => 'needinfo', status => 'X'}], +}; $t->put_ok($url . "rest/bug/$bug_id_1" => {'X-Bugzilla-API-Key' => $editbugs_api_key} => - json => {comment => {body => 'This is my comment'}})->status_is(200); + json => $needinfo_update)->status_is(200); ### Section 2: Create a new dependent bug @@ -114,19 +120,19 @@ my ($attach_id) = keys %{$t->tx->res->json->{attachments}}; my @cmd = ( './extensions/BMO/bin/export_bmo_etl.pl', - '--verbose', '--test', '--snapshot-date', '2000-01-01' + '--verbose', '--test', '--snapshot-date', $snapshot_date, ); my ($output, $error, $rv) = capture { system @cmd; }; ok(!$rv, 'Data exported to test files without error'); -ok(glob(bz_locations()->{'datadir'} . '/2000-01-01-bugs-*.json'), +ok(glob(bz_locations()->{'datadir'} . '/' . $snapshot_date . '-bugs-*.json'), 'Export test files exist'); ### Section 5: Export data to BigQuery test instance @cmd = ( - './extensions/BMO/bin/export_bmo_etl.pl', '--verbose', - '--snapshot-date', '2000-01-01' + './extensions/BMO/bin/export_bmo_etl.pl', + '--verbose', '--snapshot-date', $snapshot_date, ); ($output, $error, $rv) = capture { system @cmd; }; @@ -135,7 +141,10 @@ ok(!$rv, 'Data exported to BigQuery test instance without error'); ### Section 6: Retrieve data from BigQuery instance and verify my $query = { - query => 'SELECT summary FROM test.bugzilla.bugs WHERE id = ' . $bug_id_1 . ';', + query => 'SELECT summary FROM test.bugzilla.bugs WHERE id = ' + . $bug_id_1 + . ' AND snapshot_date = \'' + . $snapshot_date . 
'\';', useLegacySql => false }; $t->post_ok( @@ -144,7 +153,9 @@ $t->post_ok( $query = { query => 'SELECT description FROM test.bugzilla.attachments WHERE id = ' - . $attach_id . ';', + . $attach_id + . ' AND snapshot_date = \'' + . $snapshot_date . '\';', useLegacySql => false }; $t->post_ok( @@ -154,7 +165,9 @@ $t->post_ok( $query = { query => 'SELECT depends_on_id FROM test.bugzilla.bug_dependencies WHERE bug_id = ' - . $bug_id_2 . ';', + . $bug_id_2 + . ' AND snapshot_date = \'' + . $snapshot_date . '\';', useLegacySql => false }; $t->post_ok( @@ -163,7 +176,9 @@ $t->post_ok( $query = { query => 'SELECT bug_id FROM test.bugzilla.flags WHERE bug_id = ' - . $bug_id_1 . " AND name = 'needinfo';", + . $bug_id_1 + . ' AND name = \'needinfo\' AND value = \'X\' AND snapshot_date = \'' + . $snapshot_date . '\'', useLegacySql => false }; $t->post_ok( @@ -173,8 +188,10 @@ $t->post_ok( $query = { query => 'SELECT bug_id FROM test.bugzilla.flags WHERE bug_id = ' . $bug_id_1 - . " AND name = 'review' AND attachment_id = " - . $attach_id . ';', + . ' AND name = \'review\' AND attachment_id = ' + . $attach_id + . ' AND snapshot_date = \'' + . $snapshot_date . '\';', useLegacySql => false }; @@ -185,8 +202,8 @@ $t->post_ok( ### Section 7: Exporting again on the same day (with the same snapshot date) will cause the script to exit @cmd = ( - './extensions/BMO/bin/export_bmo_etl.pl', '--verbose', - '--snapshot-date', '2000-01-01', + './extensions/BMO/bin/export_bmo_etl.pl', + '--verbose', '--snapshot-date', $snapshot_date, ); ($output, $error, $rv) = capture { system @cmd; }; From 1d8ffe67987c72ed9facee5730eb95cf52353db2 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Fri, 24 Jan 2025 17:02:18 -0500 Subject: [PATCH 09/12] * Use LIMIT/OFFSET for all tables instead of just slurping the whole table into memory ** This is better for memory efficiency and container crashes * Created a single function for all two column tables --- extensions/BMO/bin/export_bmo_etl.pl | 474 ++++++++++++--------------- 1 file changed, 214 insertions(+), 260 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index 0e0807f6b5..890bf369a7 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -96,34 +96,51 @@ process_tracking_flags(); process_keywords(); process_see_also(); -process_mentors(); -process_dependencies(); -process_regressions(); -process_duplicates(); process_users(); +process_two_columns( + 'bug_mentors', 'bug_mentors', + ['bug_id', 'user_id'], + ['bug_id', 'user_id'] +); +process_two_columns( + 'dependencies', 'bug_dependencies', + ['blocked', 'dependson'], + ['bug_id', 'depends_on_id'] +); +process_two_columns( + 'regressions', 'bug_regressions', + ['regresses', 'regressed_by'], + ['bug_id', 'regresses_id'] +); +process_two_columns( + 'duplicates', 'bug_duplicates', + ['dupe', 'dupe_of'], + ['bug_id', 'duplicate_of_id'] +); + # If we are done, remove the lock delete_lock(); ### Functions sub process_bugs { - my $table_name = 'bugs'; - my $count = 0; - my $last_id = 0; + my $table_name = 'bugs'; + my $count = 0; + my $last_offset = 0; my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM bugs'); print "Processing $total $table_name.\n" if $verbose; my $sth = $dbh->prepare( - 'SELECT bug_id AS id, delta_ts AS modification_time FROM bugs WHERE bug_id > ? ORDER BY bug_id LIMIT ' - . API_BLOCK_COUNT); + 'SELECT bug_id AS id, delta_ts AS modification_time FROM bugs ORDER BY bug_id LIMIT ? OFFSET ?' 
+ ); while ($count < $total) { my @bugs = (); - $sth->execute($last_id); + $sth->execute(API_BLOCK_COUNT, $last_offset); while (my ($id, $mod_time) = $sth->fetchrow_array()) { print "Processing id $id with mod_time of $mod_time.\n" if $verbose; @@ -207,31 +224,32 @@ sub process_bugs { push @bugs, $data; $count++; - $last_id = $id; } + $last_offset += API_BLOCK_COUNT; + # Send the rows to the server send_data($table_name, \@bugs, $count) if @bugs; } } sub process_attachments { - my $table_name = 'attachments'; - my $count = 0; - my $last_id = 0; + my $table_name = 'attachments'; + my $count = 0; + my $last_offset = 0; my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM attachments'); print "Processing $total $table_name.\n" if $verbose; my $sth = $dbh->prepare( - 'SELECT attach_id AS id, modification_time FROM attachments WHERE attach_id > ? ORDER BY attach_id LIMIT ' - . API_BLOCK_COUNT); + 'SELECT attach_id, modification_time FROM attachments ORDER BY attach_id LIMIT ? OFFSET ?' + ); while ($count < $total) { - my @attachments = (); + my @results = (); - $sth->execute($last_id); + $sth->execute(API_BLOCK_COUNT, $last_offset); while (my ($id, $mod_time) = $sth->fetchrow_array()) { print "Processing id $id with mod_time of $mod_time.\n" if $verbose; @@ -266,34 +284,33 @@ sub process_attachments { store_cache($obj->id, $table_name, $obj->modification_time, $data); } - push @attachments, $data; + push @results, $data; $count++; - $last_id = $id; } + $last_offset += API_BLOCK_COUNT; + # Send the rows to the server - send_data($table_name, \@attachments, $count) if @attachments; + send_data($table_name, \@results, $count) if @results; } } sub process_flags { - my $table_name = 'flags'; - my $count = 0; - my $last_id = 0; + my $table_name = 'flags'; + my $count = 0; + my $last_offset = 0; my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM flags'); print "Processing $total $table_name.\n" if $verbose; - my $sth - = $dbh->prepare( - 'SELECT id, modification_date FROM flags WHERE id > ? ORDER BY id LIMIT ' - . API_BLOCK_COUNT); + my $sth = $dbh->prepare( + 'SELECT id, modification_date FROM flags ORDER BY id LIMIT ? OFFSET ?'); while ($count < $total) { - my @flags = (); + my @results = (); - $sth->execute($last_id); + $sth->execute(API_BLOCK_COUNT, $last_offset); while (my ($id, $mod_time) = $sth->fetchrow_array()) { print "Processing id $id with mod_time of $mod_time.\n" if $verbose; @@ -323,40 +340,43 @@ sub process_flags { store_cache($obj->id, $table_name, $obj->modification_date, $data); } - push @flags, $data; + push @results, $data; $count++; - $last_id = $id; } + $last_offset += API_BLOCK_COUNT; + # Send the rows to the server - send_data($table_name, \@flags, $count) if @flags; + send_data($table_name, \@results, $count) if @results; } } sub process_flag_state_activity { + # Process flags that were removed today using the flag_state_activity table # These entries will also go into the flags table in BigQuery. - my $table_name = 'flag_state_activity'; - my $count = 0; - my $last_id = 0; + my $table_name = 'flag_state_activity'; + my $count = 0; + my $last_offset = 0; my $total = $dbh->selectrow_array( 'SELECT COUNT(*) FROM flag_state_activity WHERE status = \'X\' AND flag_when LIKE \'' - . $snapshot_date . ' %\''); + . $snapshot_date + . ' %\''); print "Processing $total $table_name.\n" if $verbose; my $sth = $dbh->prepare( - 'SELECT id, flag_when FROM flag_state_activity WHERE id > ? AND status = \'X\' AND flag_when LIKE \'' - . $snapshot_date . ' %\' ORDER BY id LIMIT ' - . 
API_BLOCK_COUNT); + 'SELECT id, flag_when FROM flag_state_activity WHERE status = \'X\' AND flag_when LIKE \'' + . $snapshot_date + . ' %\' ORDER BY id LIMIT ? OFFSET ?'); while ($count < $total) { - my @flags = (); + my @results = (); - $sth->execute($last_id); + $sth->execute(API_BLOCK_COUNT, $last_offset); while (my ($id, $mod_time) = $sth->fetchrow_array()) { print "Processing id $id with mod_time of $mod_time.\n" if $verbose; @@ -386,284 +406,181 @@ sub process_flag_state_activity { store_cache($obj->id, $table_name, $obj->flag_when, $data); } - push @flags, $data; + push @results, $data; $count++; - $last_id = $id; } + $last_offset += API_BLOCK_COUNT; + # Send the rows to the server - send_data('flags', \@flags, $count) if @flags; + send_data('flags', \@results, $count) if @results; } } sub process_tracking_flags { - my $table_name = 'tracking_flags'; - my $rows = $dbh->selectall_arrayref( - 'SELECT tracking_flags.name AS name, tracking_flags_bugs.bug_id AS bug_id, tracking_flags_bugs.value AS value - FROM tracking_flags_bugs + my $table_name = 'tracking_flags'; + my $count = 0; + my $last_offset = 0; + + my $total = $dbh->selectrow_array( + 'SELECT COUNT(*) + FROM tracking_flags_bugs JOIN tracking_flags ON tracking_flags_bugs.tracking_flag_id = tracking_flags.id - ORDER BY tracking_flags_bugs.bug_id', {Slice => {}} + ORDER BY tracking_flags_bugs.bug_id' ); - - my $total = scalar @{$rows}; - my $count = 0; - print "Processing $total $table_name.\n" if $verbose; - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{bug_id}}; - - # Standard fields - my $data = {bug_id => $row->{bug_id}}; - - # Fields that require custom values based on other criteria - if (exists $private_bugs{$row->{bug_id}}) { - $data->{name} = undef; - $data->{value} = undef; - } - else { - $data->{name} = $row->{name}; - $data->{value} = $row->{value}; - } - - push @results, $data; - - $count++; - - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); - } - } -} - -sub process_keywords { - my $table_name = 'keywords'; - my $rows = $dbh->selectall_arrayref( - 'SELECT bug_id, keyworddefs.name AS name - FROM keywords - JOIN keyworddefs - ON keywords.keywordid = keyworddefs.id - ORDER BY bug_id', {Slice => {}} + my $sth = $dbh->prepare( + 'SELECT tracking_flags.name, tracking_flags_bugs.bug_id, tracking_flags_bugs.value + FROM tracking_flags_bugs + JOIN tracking_flags + ON tracking_flags_bugs.tracking_flag_id = tracking_flags.id + ORDER BY tracking_flags_bugs.id LIMIT ? OFFSET ?' ); - my $total = scalar @{$rows}; - my $count = 0; - - print "Processing $total $table_name.\n" if $verbose; - - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{bug_id}}; - - # Standard fields - my $data = {bug_id => $row->{bug_id}}; - - # Fields that require custom values based on other criteria - $data->{keyword} = !exists $private_bugs{$row->{bug_id}} ? 
$row->{name} : undef; - - push @results, $data; - - $count++; - - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); - } - } -} - -sub process_see_also { - my $table_name = 'see_also'; - my $rows - = $dbh->selectall_arrayref( - 'SELECT bug_id, value, class FROM bug_see_also ORDER BY bug_id', - {Slice => {}}); - - my $total = scalar @{$rows}; - my $count = 0; + while ($count < $total) { + my @results = (); - print "Processing $total $table_name.\n" if $verbose; + $sth->execute(API_BLOCK_COUNT, $last_offset); - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{bug_id}}; + while (my ($name, $bug_id, $value) = $sth->fetchrow_array()) { + next if $excluded_bugs{$bug_id}; - # Standard fields - my $data = {bug_id => $row->{bug_id},}; + # Standard fields + my $data = {bug_id => $bug_id}; - # Fields that require custom values based on other criteria - if ($private_bugs{$row->{bug_id}}) { - $data->{url} = undef; - } - else { - if ($row->{class} =~ /::Local/) { - $data->{url} - = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . $row->{value}; + # Fields that require custom values based on other criteria + if (exists $private_bugs{$bug_id}) { + $data->{name} = undef; + $data->{value} = undef; } else { - $data->{url} = $row->{value}; + $data->{name} = $name; + $data->{value} = $value; } - } - - push @results, $data; - $count++; + push @results, $data; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); + $count++; } + + $last_offset += API_BLOCK_COUNT; + + # Send the rows to the server + send_data($table_name, \@results, $count) if @results; } } -sub process_mentors { - my $table_name = 'bug_mentors'; - my $rows - = $dbh->selectall_arrayref( - 'SELECT bug_id, user_id FROM bug_mentors ORDER BY bug_id', - {Slice => {}}); - - my $total = scalar @{$rows}; - my $count = 0; +sub process_keywords { + my $table_name = 'keywords'; + my $count = 0; + my $last_offset = 0; + my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM keywords'); print "Processing $total $table_name.\n" if $verbose; - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{bug_id}}; - - my $data = {bug_id => $row->{bug_id}, user_id => $row->{user_id}}; - - push @results, $data; - - $count++; + my $sth = $dbh->prepare( + 'SELECT bug_id, keyworddefs.name + FROM keywords + JOIN keyworddefs + ON keywords.keywordid = keyworddefs.id + ORDER BY bug_id LIMIT ? OFFSET ?' 
+ ); - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); - } - } -} + while ($count < $total) { + my @results = (); -sub process_dependencies { - my $table_name = 'bug_dependencies'; - my $rows - = $dbh->selectall_arrayref( - 'SELECT blocked, dependson FROM dependencies ORDER BY blocked', - {Slice => {}}); + $sth->execute(API_BLOCK_COUNT, $last_offset); - my $total = scalar @{$rows}; - my $count = 0; + while (my ($bug_id, $keyword) = $sth->fetchrow_array()) { + next if $excluded_bugs{$bug_id}; - print "Processing $total $table_name.\n" if $verbose; + # Standard fields + my $data = {bug_id => $bug_id}; - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{blocked}}; + # Fields that require custom values based on other criteria + $data->{keyword} = !exists $private_bugs{$bug_id} ? $keyword : undef; - my $data = {bug_id => $row->{blocked}, depends_on_id => $row->{dependson}}; + push @results, $data; - push @results, $data; + $count++; + } - $count++; + $last_offset += API_BLOCK_COUNT; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); - } + # Send the rows to the server + send_data($table_name, \@results, $count) if @results; } } -sub process_regressions { - my $table_name = 'bug_regressions'; - my $rows - = $dbh->selectall_arrayref('SELECT regresses, regressed_by FROM regressions', - {Slice => {}}); - - my $total = scalar @{$rows}; - my $count = 0; +sub process_see_also { + my $table_name = 'bug_see_also'; + my $count = 0; + my $last_offset = 0; + my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM bug_see_also'); print "Processing $total $table_name.\n" if $verbose; - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{regresses}}; - - my $data = {bug_id => $row->{regresses}, regresses_id => $row->{regressed_by},}; - - push @results, $data; - - $count++; + my $sth + = $dbh->prepare( + 'SELECT bug_id, value, class FROM bug_see_also ORDER BY bug_id LIMIT ? OFFSET ?' + ); - # Send the rows to the server if we have a specific sized block - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); - } - } -} + while ($count < $total) { + my @results = (); -sub process_duplicates { - my $table_name = 'bug_duplicates'; - my $rows = $dbh->selectall_arrayref('SELECT dupe, dupe_of FROM duplicates', - {Slice => {}}); + $sth->execute(API_BLOCK_COUNT, $last_offset); - my $total = scalar @{$rows}; - my $count = 0; + while (my ($bug_id, $value, $class) = $sth->fetchrow_array()) { + next if $excluded_bugs{$bug_id}; - print "Processing $total $table_name.\n" if $verbose; + # Standard fields + my $data = {bug_id => $bug_id,}; - my @results = (); - foreach my $row (@{$rows}) { - next if $excluded_bugs{$row->{dupe}}; + # Fields that require custom values based on other criteria + if ($private_bugs{$bug_id}) { + $data->{url} = undef; + } + else { + if ($class =~ /::Local/) { + $data->{url} = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . 
$value; + } + else { + $data->{url} = $value; + } + } - my $data = {bug_id => $row->{dupe}, duplicate_of_id => $row->{dupe_of},}; + push @results, $data; - push @results, $data; + $count++; + } - $count++; + $last_offset += API_BLOCK_COUNT; - # Send the rows to the server if we have a specific sized block' - # or we are at the last row - if (scalar @results == API_BLOCK_COUNT || $total == $count) { - send_data($table_name, \@results, $count); - @results = (); - } + # Send the rows to the server + send_data($table_name, \@results, $count) if @results; } } sub process_users { - my $table_name = 'users'; - my $count = 0; - my $last_id = 0; + my $table_name = 'users'; + my $count = 0; + my $last_offset = 0; my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM profiles'); print "Processing $total $table_name.\n" if $verbose; my $sth = $dbh->prepare( - 'SELECT userid, modification_ts FROM profiles WHERE userid > ? ORDER BY userid LIMIT ' - . API_BLOCK_COUNT); + 'SELECT userid, modification_ts FROM profiles ORDER BY userid LIMIT ? OFFSET ?' + ); while ($count < $total) { my @users = (); - $sth->execute($last_id); + $sth->execute(API_BLOCK_COUNT, $last_offset); while (my ($id, $mod_time) = $sth->fetchrow_array()) { print "Processing id $id with mod_time of $mod_time.\n" if $verbose; @@ -699,14 +616,51 @@ sub process_users { push @users, $data; $count++; - $last_id = $id; } + $last_offset += API_BLOCK_COUNT; + # Send the rows to the server send_data($table_name, \@users, $count) if @users; } } +sub process_two_columns { + my ($table_name, $bq_name, $column_names, $data_names) = @_; + my $count = 0; + my $last_offset = 0; + + my $total = $dbh->selectrow_array('SELECT COUNT(*) FROM ' . $table_name); + print "Processing $total $table_name.\n" if $verbose; + + my $columns_string = join ', ', @{$column_names}; + my $order_by = $column_names->[0]; + + my $sth = $dbh->prepare( + "SELECT $columns_string FROM $table_name ORDER BY $order_by LIMIT ? 
OFFSET ?");
+
+  while ($count < $total) {
+    my @results = ();
+
+    $sth->execute(API_BLOCK_COUNT, $last_offset);
+
+    while (my ($value1, $value2) = $sth->fetchrow_array()) {
+      next if $excluded_bugs{$value1};
+
+      my $data = {$data_names->[0] => $value1, $data_names->[1] => $value2,};
+
+      push @results, $data;
+
+      $count++;
+    }
+
+    $last_offset += API_BLOCK_COUNT;
+
+    # Send the rows to the server
+    send_data($bq_name, \@results, $count) if @results;
+  }
+}
+
 sub get_cache {
   my ($id, $table, $timestamp) = @_;
 
@@ -937,15 +891,16 @@ sub get_multi_group_value {
   my $smallest_group_count = 0;
 
   foreach my $group (@{$bug->groups_in}) {
-    my $user_count = 0;
-    my $member_data = $group->members_complete;
-    foreach my $type (keys %{$member_data}) {
-      $user_count += scalar @{$member_data->{$type}};
-    }
-    if ($user_count < $smallest_group_count) {
-      $smallest_group_count = $user_count;
-      $smallest_group_name = $group->name;
-    }
+      my $user_count = 0;
+      my $member_data = $group->members_complete;
+      foreach my $type (keys %{$member_data}) {
+        $user_count += scalar @{$member_data->{$type}};
+      }
+      # The first group seen becomes the initial smallest; otherwise a
+      # comparison against the starting count of 0 could never succeed
+      if (!defined $smallest_group_name || $user_count < $smallest_group_count) {
+        $smallest_group_count = $user_count;
+        $smallest_group_name = $group->name;
+      }
   }
 
   return $smallest_group_name;

From 1243b675799c9c9aed972cf50681abd355420eaa Mon Sep 17 00:00:00 2001
From: David Lawrence
Date: Fri, 24 Jan 2025 22:55:10 -0500
Subject: [PATCH 10/12] Change bug_see_also to see_also to match proper
 BigQuery table name

---
 extensions/BMO/bin/export_bmo_etl.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl
index 890bf369a7..c29f30dd60 100644
--- a/extensions/BMO/bin/export_bmo_etl.pl
+++ b/extensions/BMO/bin/export_bmo_etl.pl
@@ -516,7 +516,7 @@ sub process_keywords {
 }
 
 sub process_see_also {
-  my $table_name = 'bug_see_also';
+  my $table_name = 'see_also';
   my $count = 0;
   my $last_offset = 0;
 

From 7bbc222615d67bf43d425dc51ed4bb8354083f72 Mon Sep 17 00:00:00 2001
From: David Lawrence
Date: Tue, 28 Jan 2025 09:33:17 -0500
Subject: [PATCH 11/12] - Convert last_seen_date from date to a timestamp for
 BigQuery - Proper boolean value for is_new instead of 0 or 1

---
 extensions/BMO/bin/export_bmo_etl.pl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl
index c29f30dd60..a165b8fd14 100644
--- a/extensions/BMO/bin/export_bmo_etl.pl
+++ b/extensions/BMO/bin/export_bmo_etl.pl
@@ -596,9 +596,9 @@ sub process_users {
       # Standard fields
       $data = {
         id => $obj->id,
-        last_seen => $obj->last_seen_date,
+        last_seen => ($obj->last_seen_date ? $obj->last_seen_date . ' 00:00:00' : undef),
        email => $obj->email,
-        is_new => $obj->is_new,
+        is_new => ($obj->is_new ? 
true : false), }; # Fields that require custom values based on criteria @@ -647,6 +647,8 @@ sub process_two_columns { while (my ($value1, $value2) = $sth->fetchrow_array()) { next if $excluded_bugs{$value1}; + print "Processing values $value1, $value2 for $table_name.\n" if $verbose; + my $data = {$data_names->[0] => $value1, $data_names->[1] => $value2,}; push @results, $data; From b9c901f8ffd2ad0679b965a3556e1c93f4b892e5 Mon Sep 17 00:00:00 2001 From: David Lawrence Date: Wed, 29 Jan 2025 17:42:21 -0500 Subject: [PATCH 12/12] Small changes based on review --- extensions/BMO/bin/export_bmo_etl.pl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/extensions/BMO/bin/export_bmo_etl.pl b/extensions/BMO/bin/export_bmo_etl.pl index a165b8fd14..9d41ef3219 100644 --- a/extensions/BMO/bin/export_bmo_etl.pl +++ b/extensions/BMO/bin/export_bmo_etl.pl @@ -84,7 +84,7 @@ # Bugs that are private to one or more groups our %private_bugs = (); -# I order to avoid entering duplicate data, we will first query BigQuery +# In order to avoid entering duplicate data, we will first query BigQuery # to make sure other entries with this date are not already present. check_for_duplicates(); @@ -543,13 +543,11 @@ sub process_see_also { if ($private_bugs{$bug_id}) { $data->{url} = undef; } + elsif ($class =~ /::Local/) { + $data->{url} = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . $value; + } else { - if ($class =~ /::Local/) { - $data->{url} = Bugzilla->localconfig->urlbase . 'show_bug.cgi?id=' . $value; - } - else { - $data->{url} = $value; - } + $data->{url} = $value; } push @results, $data;
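 
     # For illustration only (not part of this patch): each block accumulated
     # above is handed to send_data(), which streams it to BigQuery. Assuming
     # the standard tabledata.insertAll endpoint, the encoded payload looks
     # roughly like this before it is POSTed to
     # projects/<project>/datasets/<dataset>/tables/<table>/insertAll:
     #
     #   my $big_query = {
     #     kind => 'bigquery#tableDataInsertAllRequest',
     #     rows => [map { {json => $_} } @results],
     #   };
     #
     # A 200 response can still carry row-level insertErrors, which is why
     # send_data() treats a non-empty insertErrors array as a fatal failure
     # and deletes the ETL lock before dying.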