airflow-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From fo...@apache.org
Subject incubator-airflow git commit: [AIRFLOW-1882] Add ignoreUnknownValues option to gcs_to_bq operator
Date Fri, 16 Feb 2018 11:36:48 GMT
Repository: incubator-airflow
Updated Branches:
  refs/heads/master 074295129 -> c739adc62


[AIRFLOW-1882] Add ignoreUnknownValues option to gcs_to_bq operator

- Added `ignore_unknown_values` to the `run_load` method in the
  BigQuery hook (`bigquery_hook.py`)
- Added `ignore_unknown_values` to `GoogleCloudStorageToBigQueryOperator`

Closes #3042 from kaxil/AIRFLOW-1882


Project: http://git-wip-us.apache.org/repos/asf/incubator-airflow/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-airflow/commit/c739adc6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-airflow/tree/c739adc6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-airflow/diff/c739adc6

Branch: refs/heads/master
Commit: c739adc623818287e8e7de7017aa3a2af085912e
Parents: 0742951
Author: Kaxil Naik <kaxilnaik@gmail.com>
Authored: Fri Feb 16 12:36:42 2018 +0100
Committer: Fokko Driesprong <fokkodriesprong@godatadriven.com>
Committed: Fri Feb 16 12:36:42 2018 +0100

----------------------------------------------------------------------
 airflow/contrib/hooks/bigquery_hook.py | 10 ++++++++++
 airflow/contrib/operators/gcs_to_bq.py | 10 ++++++++++
 2 files changed, 20 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/c739adc6/airflow/contrib/hooks/bigquery_hook.py
----------------------------------------------------------------------
diff --git a/airflow/contrib/hooks/bigquery_hook.py b/airflow/contrib/hooks/bigquery_hook.py
index ce7e2c3..cd0318f 100644
--- a/airflow/contrib/hooks/bigquery_hook.py
+++ b/airflow/contrib/hooks/bigquery_hook.py
@@ -733,6 +733,7 @@ class BigQueryBaseCursor(LoggingMixin):
                  field_delimiter=',',
                  max_bad_records=0,
                  quote_character=None,
+                 ignore_unknown_values=False,
                  allow_quoted_newlines=False,
                  allow_jagged_rows=False,
                  schema_update_options=(),
@@ -776,6 +777,12 @@ class BigQueryBaseCursor(LoggingMixin):
         :param quote_character: The value that is used to quote data sections in a CSV
             file.
         :type quote_character: string
+        :param ignore_unknown_values: [Optional] Indicates if BigQuery should allow
+            extra values that are not represented in the table schema.
+            If true, the extra values are ignored. If false, records with extra columns
+            are treated as bad records, and if there are too many bad records, an
+            invalid error is returned in the job result.
+        :type ignore_unknown_values: bool
         :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not
             (false).
         :type allow_quoted_newlines: boolean
@@ -842,6 +849,7 @@ class BigQueryBaseCursor(LoggingMixin):
                 'sourceFormat': source_format,
                 'sourceUris': source_uris,
                 'writeDisposition': write_disposition,
+                'ignoreUnknownValues': ignore_unknown_values
             }
         }
 
@@ -885,6 +893,8 @@ class BigQueryBaseCursor(LoggingMixin):
             src_fmt_configs['skipLeadingRows'] = skip_leading_rows
         if 'fieldDelimiter' not in src_fmt_configs:
             src_fmt_configs['fieldDelimiter'] = field_delimiter
+        if 'ignoreUnknownValues' not in src_fmt_configs:
+            src_fmt_configs['ignoreUnknownValues'] = ignore_unknown_values
         if quote_character is not None:
             src_fmt_configs['quote'] = quote_character
         if allow_quoted_newlines:

http://git-wip-us.apache.org/repos/asf/incubator-airflow/blob/c739adc6/airflow/contrib/operators/gcs_to_bq.py
----------------------------------------------------------------------
diff --git a/airflow/contrib/operators/gcs_to_bq.py b/airflow/contrib/operators/gcs_to_bq.py
index 97b1ef0..be15e52 100644
--- a/airflow/contrib/operators/gcs_to_bq.py
+++ b/airflow/contrib/operators/gcs_to_bq.py
@@ -66,6 +66,12 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator):
     :type max_bad_records: int
     :param quote_character: The value that is used to quote data sections in a CSV file.
     :type quote_character: string
+    :param ignore_unknown_values: [Optional] Indicates if BigQuery should allow
+        extra values that are not represented in the table schema.
+        If true, the extra values are ignored. If false, records with extra columns
+        are treated as bad records, and if there are too many bad records, an
+        invalid error is returned in the job result.
+    :type ignore_unknown_values: bool
     :param allow_quoted_newlines: Whether to allow quoted newlines (true) or not (false).
     :type allow_quoted_newlines: boolean
     :param allow_jagged_rows: Accept rows that are missing trailing optional columns.
@@ -124,6 +130,7 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator):
                  field_delimiter=',',
                  max_bad_records=0,
                  quote_character=None,
+                 ignore_unknown_values=False,
                  allow_quoted_newlines=False,
                  allow_jagged_rows=False,
                  max_id_key=None,
@@ -154,6 +161,7 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator):
         self.field_delimiter = field_delimiter
         self.max_bad_records = max_bad_records
         self.quote_character = quote_character
+        self.ignore_unknown_values = ignore_unknown_values
         self.allow_quoted_newlines = allow_quoted_newlines
         self.allow_jagged_rows = allow_jagged_rows
         self.external_table = external_table
@@ -198,6 +206,7 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator):
                 field_delimiter=self.field_delimiter,
                 max_bad_records=self.max_bad_records,
                 quote_character=self.quote_character,
+                ignore_unknown_values=self.ignore_unknown_values,
                 allow_quoted_newlines=self.allow_quoted_newlines,
                 allow_jagged_rows=self.allow_jagged_rows,
                 src_fmt_configs=self.src_fmt_configs
@@ -214,6 +223,7 @@ class GoogleCloudStorageToBigQueryOperator(BaseOperator):
                 field_delimiter=self.field_delimiter,
                 max_bad_records=self.max_bad_records,
                 quote_character=self.quote_character,
+                ignore_unknown_values=self.ignore_unknown_values,
                 allow_quoted_newlines=self.allow_quoted_newlines,
                 allow_jagged_rows=self.allow_jagged_rows,
                 schema_update_options=self.schema_update_options,


Mime
View raw message