E.g.: behaviours uses the field 'entity_fields'
A configuration table in the database tells the job which entities and fields to extract.
Table: datasync_entities
Primary key: (entity_schema, entity_name)
Fields:
- entity_schema: varchar, not null, default "public"
- entity_name: varchar, not null
- entity_fields: varchar, not null, default "*"
Note: if entity_fields is anything other than "*", it must contain at least the technical fields id, modified_instant and deleted_instant (comma-separated).
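For illustration, a minimal sketch of how a job script can read this configuration table. The connection values are hypothetical placeholders (the real job receives them via the --BBDD_* arguments shown further below), and psycopg2 is assumed as the PostgreSQL driver:

import psycopg2  # assumed PostgreSQL driver

# Hypothetical connection values; in the real job they come from the
# --BBDD_HOST / --BBDD_PORT / --BBDD_USER / --BBDD_PASSWORD job arguments.
conn = psycopg2.connect(host="db-host", port=5432, dbname="app",
                        user="glue", password="secret")

with conn, conn.cursor() as cur:
    cur.execute("SELECT entity_schema, entity_name, entity_fields FROM datasync_entities")
    for entity_schema, entity_name, entity_fields in cur.fetchall():
        # entity_fields is "*" or a comma-separated list that must include
        # id, modified_instant and deleted_instant
        print(f'SELECT {entity_fields} FROM "{entity_schema}"."{entity_name}"')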
E.g.: dagger
Let's see how to configure retention on these log groups: update them if they already exist, or create them otherwise.
AWS Glue ETL jobs of type "pythonshell" can only use the default log groups (created implicitly):
/aws-glue/python-jobs/output
/aws-glue/python-jobs/error
By default, these log groups are created without a retention policy (logs never expire).
main.tf
# For checking if Glue pythonshell log groups exist
data "aws_cloudwatch_log_groups" "glue_existing" {
  log_group_name_prefix = "/aws-glue/python-jobs/"
}
locals {
  # Example retention value; referenced by the resource below (adjust as needed)
  cloudwatch_retention_in_days = 30

  glue_existing_log_groups = toset([
    for lg in data.aws_cloudwatch_log_groups.glue_existing.log_group_names : lg
    if contains(["/aws-glue/python-jobs/output", "/aws-glue/python-jobs/error"], lg)
  ])
}
# Glue: Import existing log groups.
# Import blocks are only allowed in the root module.
import {
  for_each = local.glue_existing_log_groups

  to = aws_cloudwatch_log_group.python_jobs[each.value]
  id = each.value
}
# Manage log groups (create if not exist, update retention if exist)
resource "aws_cloudwatch_log_group" "python_jobs" {
  for_each = toset(["/aws-glue/python-jobs/output", "/aws-glue/python-jobs/error"])

  name              = each.value
  retention_in_days = local.cloudwatch_retention_in_days
}
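With this combination, the first apply imports whichever of the two log groups already exist (the import block's for_each is simply empty for missing ones), and the resource then creates the missing groups and enforces the retention setting on all of them. Note that for_each in import blocks requires Terraform v1.7 or later.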
For AWS Glue Python shell jobs, you cannot set a custom log stream prefix or change the default log groups via Terraform or job arguments.
AWS Glue Python shell jobs are restricted to the following hardcoded CloudWatch log groups:
/aws-glue/python-jobs/output
/aws-glue/python-jobs/error
The log stream names are automatically set to the job_run_id and cannot be prefixed.
Workaround: Custom Logging in Python
If you need a custom prefix or a dedicated log group (for example, to separate logs by environment or job name), you must implement it directly in your Python script using boto3 or a library like watchtower.
For example, you can add this to your Python script to send logs to a custom location:
import logging

import boto3  # watchtower dependency; also lets you pass a custom session
import watchtower

# job_run_id must be defined in your script (e.g. read it from the job
# arguments if you pass it in); placeholder value here:
job_run_id = "jr_example"

# Initialize custom logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# This allows you to set a custom log group and stream prefix
# (in watchtower >= 2 the parameters are named log_group_name / log_stream_name)
logger.addHandler(watchtower.CloudWatchLogHandler(
    log_group='/my-app/glue-jobs',
    stream_name='my-prefix-{strftime:%Y-%m-%d}-' + job_run_id
))
logger.info("This will appear with your specified prefix!")
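Watchtower buffers log records and flushes them in the background; for short-lived jobs, call logging.shutdown() at the end of the script so any buffered records are sent before the process exits.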
IMPORTANT:
Standard print() statements and system errors will still go to the default /aws-glue/python-jobs/ groups regardless of any custom logger you implement.
E.g.: docrepo (S3 naming); behaviours; thanos (deleted_instant and datasync_entities)
# pythonshell jobs ONLY use default logs /aws-glue/python-jobs/output & /aws-glue/python-jobs/error
#
resource "aws_glue_job" "deploy_script" {
command {
name = "pythonshell"
script_location = "s3://${replace(var.s3_coderepository_arn, "arn:aws:s3:::", "")}/${var.glue_bucket_coderepository_script_key}"
python_version = "3.9" #(3.9 when glue v4 or v3) See: https://docs.aws.amazon.com/glue/latest/dg/release-notes.html
}
connections = [aws_glue_connection.rds_connection.name]
default_arguments = {
"!WARNING" = "All values are managed by Terraform: next apply will replace them!" #W/out '--' (nothing uses it)
# ---- [1/2]Special Parameters Used by AWS Glue ---- https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html
"library-set" = "analytics"
#"--continuous-log-logGroup" = aws_cloudwatch_log_group.glue_lg.name - only work for Spark ETL jobs
#"--continuous-log-logStreamPrefix" = "${local.ci}-${local.env}" - only work for Spark ETL jobs
#"--enable-continuous-cloudwatch-log" = "true" - only work for Spark ETL jobs
#"--enable-continuous-log-filter" = "true" - only work for Spark ETL jobs
#"--enable-auto-scaling" = "true"
"--enable-job-insights" = "false"
"--enable-metrics" = "true"
"--enable-observability-metrics" = "false"
"--job-language" = "python"
# ---- [2/2]Calling AWS Glue APIs in Python ---- https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/glue_job#default_arguments-1
"--AWS_ACCESS_KEY_ID" = var.glue_job_aws_ak #Argument 1
"--AWS_SECRET_ACCESS_KEY" = var.glue_job_aws_sak #Argument 2
"--BBDD_HOST" = var.db_host #Used by the script for the PostgreSQL connection
"--BBDD_PORT" = var.db_port #Used by the script for the PostgreSQL connection
"--BBDD_USER" = var.db_user #Used by the script for the PostgreSQL connection
"--BBDD_PASSWORD" = var.db_pass #Used by the script for the PostgreSQL connection
"--BBDD_GLUE_ENTITIES_TABLE_NAME" = var.db_table_glue_entities #DB table with the field 'entity_name'
"--BBDD_GLUE_ENTITIES_DELTA_DAYS" = "7"
"--ENV" = local.env #Argument 9
"--GLUE_ROLE_ARN" = var.glue_role_arn #Argument 10
"--GLUE_S3_AUDITPROCESS_NAME" = var.audit_bucket_name #Argument 11
"--CI" = local.ci #Argument 12
# "--extra-py-files" = "s3://${aws_s3_object.snowflake_library.bucket}/${aws_s3_object.snowflake_library.key}"
}
description = "${local.ci} Glue job"
execution_class = "STANDARD" #STANDARD or FLEX
#execution_property = (Optional) Execution property of the job
#DOC glue_version> https://docs.aws.amazon.com/glue/latest/dg/release-notes.html
glue_version = null #Null if PythonShell. Glue 4.0-->Python 3.9. Glue 3.0-->Python 3.9
job_mode = "SCRIPT" #(Optional) Describes how a job was created
job_run_queuing_enabled = true
#maintenance_window= (Optional)
max_capacity = 0.0625 #Conflicts w/ worker_type 'G.1X'
max_retries = 0
name = var.glue-job-name
#non_overridable_arguments=
#notification_property=
#number_of_workers = 1 #Required for FLEX jobs
#region=
role_arn = var.glue_service_role_arn
#tags=
timeout = 480 #default is 480 minutes (8 hours) for a Glue 5.0 ETL job
#security_configuration=
#source_control_details=
#worker_type = "G.1X" #Only [G.1X, G.2X] worker types are supported for FLEX jobs
}
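On the script side, these default_arguments can be read with getResolvedOptions from the awsglue utilities (also available to Python shell jobs). A minimal sketch, using the argument names defined above:

import sys
from awsglue.utils import getResolvedOptions

# Resolve the default_arguments defined in Terraform (names without '--')
args = getResolvedOptions(sys.argv, [
    "BBDD_HOST", "BBDD_PORT", "BBDD_USER", "BBDD_PASSWORD",
    "BBDD_GLUE_ENTITIES_TABLE_NAME", "BBDD_GLUE_ENTITIES_DELTA_DAYS",
    "ENV", "CI",
])
print(f"Connecting to {args['BBDD_HOST']}:{args['BBDD_PORT']} (env={args['ENV']})")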