# invalid_email This rule validates that varchar columns that represent an email have a valid email address. For example, consider the `email` column in the `contacts` table. The following dataset snippet should fail validation, ``` python pprint_csv_file(asset("invalid-dataset-1.csv"), "Invalid dataset") ```
Invalid dataset ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ contactID ┃ email ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ 1 │ john.doe │ └───────────────────────────────────────────────────────────┴─────────────────────────────────────────────────────┘whereas the following should pass, ``` python pprint_csv_file(asset("valid-dataset-1.csv"), "Valid dataset") ```
Valid dataset ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ contactID ┃ email ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ 1 │ john.doe@email.com │ └────────────────────────────────────────┴────────────────────────────────────────────────────────────────────────┘## Error report The error report will have the following fields - **errorType**: invalid_email - **tableName**: The name of the table whose row has the invalid email - **columnName** The name of the column with the invalid email - **rowNumber**: The index of the table row with the error - **row** The row in the data that failed this validation rule - **invalidValue**: The invalid email value - **validationRuleFields**: The ODM data dictionary rule fields violated by this row - **message**: Invalid email
{ │ 'errors': [ │ │ { │ │ │ 'errorType': 'invalid_email', │ │ │ 'tableName': 'contacts', │ │ │ 'columnName': 'email', │ │ │ 'rowNumber': 1, │ │ │ 'row': { │ │ │ │ 'contactID': '1', │ │ │ │ 'email': 'john.doe' │ │ │ }, │ │ │ 'invalidValue': 'john.doe', │ │ │ 'validationRuleFields': [], │ │ │ 'message': 'Invalid email john.doe found in row 1 for column email in table contacts' │ │ } │ ], │ 'warnings': [] }## Rule metadata The dictionary currently does not have any metadata to say if a column is an email or not. Instead we will be hardcoding this rule to a set of pre-determined email columns. For version 2 the email columns are: ``` python pprint_yaml_file(asset("version-2-email-columns.yml")) ```
[ │ { │ │ 'partID': 'email', │ │ 'table': 'contacts' │ } ]In the above file, - The `partID` field contains the name of the email column and - The `table` field contains the name of the table that the part is a column in. If a parts sheet contains any of the above mentioned columns, then this validation rule should be added to them. For example, in the following parts sheet snippet this rule should be added to all columns except for `geoLat`. ``` python pprint_csv_file(asset("parts.csv"), title = "Parts v2", ignore_prefix = "version1") ```
Parts v2 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ partID ┃ partType ┃ sites ┃ contacts ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ sites │ tables │ NA │ NA │ │ geoLat │ attributes │ header │ NA │ │ contacts │ tables │ NA │ NA │ │ email │ attributes │ NA │ header │ └────────────────────────────┴─────────────────────────────────┴──────────────────────┴───────────────────────────┘## Cerberus schema We will be using a custom rule called `is_email` to each column. Alternative appraoches and reasons for not using them are: 1. `type` rule: We would prefer to keep the value of this rule the same as the `dataType` column in the ODM 2. `regex` rule: Better than type but is less clear to a user of the schema what the regex is actually trying to validate. Underneath the hood the `is_email` rule will be using a regex to validate the column value. An example of the regex can be seen in this [stack overflow thread](https://stackoverflow.com/a/201378/1950599). For the parts snippet above the following schema should be generated, ``` python pprint_yaml_file(asset("schema-v2.yml")) ```
{ │ 'schemaVersion': '2.0.0', │ 'schema': { │ │ 'sites': { │ │ │ 'type': 'list', │ │ │ 'schema': { │ │ │ │ 'type': 'dict', │ │ │ │ 'email': { │ │ │ │ │ 'is_email': True, │ │ │ │ │ 'meta': [ │ │ │ │ │ │ { │ │ │ │ │ │ │ 'ruleID': 'invalid_email', │ │ │ │ │ │ │ 'meta': [ │ │ │ │ │ │ │ │ { │ │ │ │ │ │ │ │ │ 'partID': 'email', │ │ │ │ │ │ │ │ │ 'partType': 'attributes', │ │ │ │ │ │ │ │ │ 'contacts': 'header' │ │ │ │ │ │ │ │ } │ │ │ │ │ │ │ ] │ │ │ │ │ │ } │ │ │ │ │ ] │ │ │ │ } │ │ │ }, │ │ │ 'meta': [ │ │ │ │ { │ │ │ │ │ 'partID': 'sites', │ │ │ │ │ 'partType': 'tables' │ │ │ │ } │ │ │ ] │ │ } │ } }## Version 1 For version 1 schemas, we add this rule to the version 1 equivalents of the above mentioned version 2 email columns. In addition, this rule should also be added to the following version 1 only columns: ``` python pprint_yaml_file(asset("version-1-email-columns.yml")) ```
[ │ { │ │ 'partID': 'contactEmail', │ │ 'table': 'Lab' │ } ]For example, for the following version 1 parts snippet, ``` python pprint_csv_file(asset("parts.csv"), title = "Parts v1") ```
Parts v1 ┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓ ┃ partID ┃ partType ┃ sites ┃ contacts ┃ version1Location ┃ version1Table ┃ version1Variable ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩ │ sites │ tables │ NA │ NA │ tables │ Site │ NA │ │ geoLat │ attributes │ header │ NA │ variables │ Site │ Latitude │ │ contacts │ tables │ NA │ NA │ tables │ Contact │ NA │ │ email │ attributes │ NA │ header │ variables │ Contact │ contactEmail │ └────────────┴───────────────┴──────────┴────────────┴─────────────────────┴─────────────────┴────────────────────┘the following validation schema should be generated, ``` python pprint_yaml_file(asset("schema-v1.yml")) ```
{ │ 'schemaVersion': '1.0.0', │ 'schema': { │ │ 'Site': { │ │ │ 'type': 'list', │ │ │ 'schema': { │ │ │ │ 'type': 'dict', │ │ │ │ 'contactEmail': { │ │ │ │ │ 'is_email': True, │ │ │ │ │ 'meta': [ │ │ │ │ │ │ { │ │ │ │ │ │ │ 'ruleID': 'invalid_email', │ │ │ │ │ │ │ 'meta': [ │ │ │ │ │ │ │ │ { │ │ │ │ │ │ │ │ │ 'partID': 'email', │ │ │ │ │ │ │ │ │ 'partType': 'attributes', │ │ │ │ │ │ │ │ │ 'contacts': 'header', │ │ │ │ │ │ │ │ │ 'version1Location': 'variables', │ │ │ │ │ │ │ │ │ 'version1Table': 'Site', │ │ │ │ │ │ │ │ │ 'version1Variable': 'contactEmail' │ │ │ │ │ │ │ │ } │ │ │ │ │ │ │ ] │ │ │ │ │ │ } │ │ │ │ │ ] │ │ │ │ } │ │ │ }, │ │ │ 'meta': [ │ │ │ │ { │ │ │ │ │ 'partID': 'sites', │ │ │ │ │ 'partType': 'tables', │ │ │ │ │ 'version1Location': 'tables', │ │ │ │ │ 'version1Table': 'Site' │ │ │ │ } │ │ │ ] │ │ } │ } }