Created
August 30, 2019 16:29
-
-
Save sunahsuh/40d250b542890e17967a38cb283d39ba to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python3 | |
| from google.cloud import bigquery | |
| import re | |
| DATASET = 'telemetry' | |
| TABLE_IDS = [ | |
| 'active_profiles', | |
| 'active_profiles_v1', | |
| 'addon_install_blocked', | |
| 'addons', | |
| 'addons_aggregates', | |
| 'addons_aggregates_v2', | |
| 'addons_v2', | |
| 'advancedtelemetry', | |
| 'android_anr_report', | |
| 'anonymous', | |
| 'bhr', | |
| 'block_autoplay', | |
| 'certificate_checker', | |
| 'churn', | |
| 'churn_v2', | |
| 'churn_v3', | |
| 'client_probe_counts_v1', | |
| 'client_probe_counts_v2', | |
| 'clients_aggregates_v1', | |
| 'clients_aggregates_v2', | |
| 'clients_daily', | |
| 'clients_daily_histogram_aggregates_v1', | |
| 'clients_daily_histogram_aggregates_v2', | |
| 'clients_daily_keyed_scalar_aggregates_v1', | |
| 'clients_daily_scalar_aggregates_v1', | |
| 'clients_daily_v6', | |
| 'clients_last_seen', | |
| 'clients_last_seen_raw_v1', | |
| 'clients_last_seen_v1', | |
| 'clients_profile_per_install_affected', | |
| 'core', | |
| 'core_clients_daily', | |
| 'core_clients_daily_v1', | |
| 'core_clients_last_seen', | |
| 'core_clients_last_seen_raw_v1', | |
| 'core_clients_last_seen_v1', | |
| 'crash', | |
| 'crash_aggregates', | |
| 'crash_aggregates_v1', | |
| 'crash_summary', | |
| 'crash_summary_v1', | |
| 'crash_summary_v2', | |
| 'deletion', | |
| 'deployment_checker', | |
| 'disable_sha1rollout', | |
| 'downgrade', | |
| 'eng_workflow_build_parquet', | |
| 'eng_workflow_build_parquet_v1', | |
| 'eng_workflow_hgpush_parquet', | |
| 'eng_workflow_hgpush_parquet_v1', | |
| 'event', | |
| 'events', | |
| 'events_v1', | |
| 'experiment_error_aggregates', | |
| 'experiment_error_aggregates_v1', | |
| 'experiments', | |
| 'experiments_aggregates', | |
| 'experiments_aggregates_v1', | |
| 'experiments_v1', | |
| 'fenix_events_amplitude_v1', | |
| 'fenix_events_v1', | |
| 'firefox_accounts_exact_mau28_by_dimensions', | |
| 'firefox_accounts_exact_mau28_by_dimensions_v1', | |
| 'firefox_accounts_exact_mau28_raw_v1', | |
| 'firefox_accounts_exact_mau28_v1', | |
| 'firefox_desktop_exact_mau28', | |
| 'firefox_desktop_exact_mau28_by_dimensions', | |
| 'firefox_desktop_exact_mau28_by_dimensions_v1', | |
| 'firefox_desktop_exact_mau28_v1', | |
| 'firefox_kpi_dashboard', | |
| 'firefox_kpi_dashboard_v1', | |
| 'firefox_nondesktop_exact_mau28', | |
| 'firefox_nondesktop_exact_mau28_by_dimensions', | |
| 'firefox_nondesktop_exact_mau28_by_dimensions_v1', | |
| 'firefox_nondesktop_exact_mau28_by_product', | |
| 'firefox_nondesktop_exact_mau28_by_product_v1', | |
| 'firefox_nondesktop_exact_mau28_raw_v1', | |
| 'firefox_nondesktop_exact_mau28_v1', | |
| 'first_shutdown', | |
| 'first_shutdown_summary', | |
| 'first_shutdown_summary_v4', | |
| 'flash_shield_study', | |
| 'focus_event', | |
| 'frecency_update', | |
| 'ftu', | |
| 'fxa_all_events', | |
| 'fxa_all_events_v1', | |
| 'fxa_auth_bounce_events', | |
| 'fxa_auth_bounce_events_v1', | |
| 'fxa_auth_events', | |
| 'fxa_auth_events_v1', | |
| 'fxa_content_auth_events', | |
| 'fxa_content_auth_events_v1', | |
| 'fxa_content_events', | |
| 'fxa_content_events_v1', | |
| 'fxa_users_daily', | |
| 'fxa_users_daily_v1', | |
| 'fxa_users_last_seen', | |
| 'fxa_users_last_seen_raw_v1', | |
| 'fxa_users_last_seen_v1', | |
| 'fxa_users_last_seen_v1_outgoing', | |
| 'glean_clients_daily_v1', | |
| 'glean_clients_last_seen_raw_v1', | |
| 'glean_clients_last_seen_v1', | |
| 'health', | |
| 'heartbeat', | |
| 'lockwise_mobile_events', | |
| 'lockwise_mobile_events_v1', | |
| 'main', | |
| 'main_summary', | |
| 'main_summary_v3', | |
| 'main_summary_v4', | |
| 'malware_addon_states', | |
| 'mobile_event', | |
| 'mobile_metrics', | |
| 'modules', | |
| 'new_profile', | |
| 'nondesktop_clients_last_seen_v1', | |
| 'optout', | |
| 'outofdate_notifications_system_addon', | |
| 'pioneer_study', | |
| 'pre_account', | |
| 'prio', | |
| 'remote_content_uptake', | |
| 'remote_content_uptake_v1', | |
| 'retention', | |
| 'retention_v1', | |
| 'saved_session', | |
| 'searchvol', | |
| 'searchvolextra', | |
| 'shield_icq_v1', | |
| 'shield_study', | |
| 'shield_study_addon', | |
| 'shield_study_error', | |
| 'smoot_all_usage', | |
| 'smoot_all_usage_v1', | |
| 'smoot_clients_daily_1percent', | |
| 'smoot_clients_daily_1percent_v1', | |
| 'smoot_clients_last_seen_1percent', | |
| 'smoot_clients_last_seen_1percent_cleaned', | |
| 'smoot_clients_last_seen_1percent_cleaned_v1', | |
| 'smoot_clients_last_seen_1percent_raw_v1', | |
| 'smoot_clients_last_seen_1percent_v1', | |
| 'smoot_desktop_0week', | |
| 'smoot_desktop_0week_v1', | |
| 'smoot_desktop_usage', | |
| 'smoot_desktop_usage_raw_v1', | |
| 'smoot_desktop_usage_v1', | |
| 'smoot_metrics', | |
| 'smoot_metrics_v1', | |
| 'smoot_nondesktop_usage', | |
| 'smoot_nondesktop_usage_raw_v1', | |
| 'smoot_nondesktop_usage_v1', | |
| 'smoot_usage', | |
| 'smoot_usage_1week_raw_v1', | |
| 'smoot_usage_2week_raw_v1', | |
| 'smoot_usage_all', | |
| 'smoot_usage_all_mtr_v1', | |
| 'smoot_usage_all_v1', | |
| 'smoot_usage_desktop', | |
| 'smoot_usage_desktop_raw_v1', | |
| 'smoot_usage_desktop_v1', | |
| 'smoot_usage_fxa_raw_v1', | |
| 'smoot_usage_fxa_v1', | |
| 'smoot_usage_metrics_raw_v1', | |
| 'smoot_usage_nondesktop', | |
| 'smoot_usage_nondesktop_raw_v1', | |
| 'smoot_usage_nondesktop_v1', | |
| 'smoot_usage_v1', | |
| 'socorro_crash', | |
| 'socorro_crash_v2', | |
| 'ssl_ratios', | |
| 'ssl_ratios_v1', | |
| 'sync', | |
| 'sync_events', | |
| 'sync_events_v1', | |
| 'sync_flat_summary', | |
| 'sync_flat_summary_v1', | |
| 'sync_log', | |
| 'sync_log_device_activity', | |
| 'sync_log_device_activity_v1', | |
| 'sync_log_device_counts', | |
| 'sync_log_device_counts_v1', | |
| 'sync_log_v1', | |
| 'sync_summary', | |
| 'sync_summary_v2', | |
| 'system_addon_deployment_diagnostics', | |
| 'telemetry_anonymous_parquet', | |
| 'telemetry_anonymous_parquet_v1', | |
| 'telemetry_core_parquet', | |
| 'telemetry_core_parquet_v3', | |
| 'telemetry_downgrade_parquet', | |
| 'telemetry_downgrade_parquet_v1', | |
| 'telemetry_focus_event_parquet', | |
| 'telemetry_focus_event_parquet_v1', | |
| 'telemetry_heartbeat_parquet', | |
| 'telemetry_heartbeat_parquet_v1', | |
| 'telemetry_ip_privacy_parquet', | |
| 'telemetry_ip_privacy_parquet_v1', | |
| 'telemetry_mobile_event_parquet', | |
| 'telemetry_mobile_event_parquet_v2', | |
| 'telemetry_new_profile_parquet', | |
| 'telemetry_new_profile_parquet_v2', | |
| 'telemetry_shield_study_parquet', | |
| 'telemetry_shield_study_parquet_v1', | |
| 'testpilot', | |
| 'testpilottest', | |
| 'tls13_middlebox_alt_server_hello_1', | |
| 'tls13_middlebox_beta', | |
| 'tls13_middlebox_draft22', | |
| 'tls13_middlebox_ghack', | |
| 'tls13_middlebox_repetition', | |
| 'tls13_middlebox_testing', | |
| 'tls_13_study', | |
| 'tls_13_study_v1', | |
| 'tls_13_study_v2', | |
| 'tls_13_study_v3', | |
| 'tls_13_study_v4', | |
| 'uitour_tag', | |
| 'untrusted_modules', | |
| 'update', | |
| 'x_contextual_feature_recommendation' | |
| ] | |
| UDF_DATASET = "udf_legacy" | |
| UDFS = [ | |
| "contains", | |
| "date_format", | |
| "to_iso8601" | |
| ] | |
| substitutions = [ | |
| # Add dataset to table names | |
| (f"(^|\\s+)({'|'.join(TABLE_IDS)})\\b", f"\\1{DATASET}.\\2"), | |
| # MMMMYYDD to MMMM-YY-DD | |
| (r"20(\d\d)(\d\d)(\d\d)", r"20\1-\2-\3"), | |
| # submission_date_s3 => submission_date | |
| (r"submission_date_s3", r"submission_date"), | |
| # sample_id string to int | |
| (r"sample_id ([<=>]+) '(\d+)'", r"sample_id \1 \2"), | |
| # remove ARRAY from array declarations | |
| (r"(?i)ARRAY\s*\[", r"["), | |
| # DOUBLE or FLOAT -> FLOAT64 | |
| (r"(?i)AS (DOUBLE|FLOAT)", "AS FLOAT64"), | |
| # INTEGER -> INT64 | |
| (r"(?i)AS (INT|INTEGER|BIGINT|SMALLINT|TINYINT)\b", "AS INT64"), | |
| # VARCHAR or CHAR -> STRING | |
| (r"(?i)AS (VARCHAR|CHAR)", "AS STRING"), | |
| # Common date parsing no longer necessary (note: keeping this after | |
| # the submission_date_s3 sub means we catch more cases since some | |
| # tables use submission_date as the date field | |
| (r"(?i)DATE_PARSE\(submission_date, '%Y%m%d'\)", "submission_date"), | |
| # Map Access -> get_key | |
| (r"([\w\.]+)\[('[\w]+?')\]", r"udf.get_key(\1, \2)"), | |
| # approx_distinct -> approx_count_distinct | |
| (r"(?i)APPROX_DISTINCT\(", "APPROX_COUNT_DISTINCT("), | |
| # TRY_CAST -> SAFE_CAST | |
| (r"(?i)TRY_CAST\(", "SAFE_CAST("), | |
| # element_at -> udf.get_key | |
| (r"(?i)ELEMENT_AT\(", "udf.get_key("), | |
| # use UDFs for some built-in functions | |
| (f"(?i)\\b({'|'.join(UDFS)})\(", r"{UDF_DATASET}.\1("), | |
| ] | |
| def translate(text): | |
| client = bigquery.Client(project='moz-fx-data-derived-datasets') | |
| subs = [(re.compile(r[0]), r[1]) for r in substitutions] | |
| for pattern, replacement in subs: | |
| text = re.sub(pattern, replacement, text) | |
| return text | |
| with open('/Users/ssuh/dev/mozilla/scratch/queries.txt', 'r') as f: | |
| q = f.read() | |
| print(translate(q)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment