From 83047eb325abf74ef9e0ba29f14f69ae37b95455 Mon Sep 17 00:00:00 2001
From: Cristhian Garcia
Date: Wed, 28 Aug 2024 13:40:32 -0500
Subject: [PATCH 1/2] feat: create dataset for tagging

docs: add documentation for course_tags
fix: avoid tag duplication
fix: split subtags
fix: split subtags
fix: split subtags
tmp: try new datasets
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: add sources
fix: turn most_recent views into MV
fix: remove duplicated schema
fix: sink only object tag id
fix: sink only object tag id
fix: add value to most_recent_tags dataset
fix: add value to most_recent_tags dataset
feat: add taxonomy name for course_tags
fix: add final keyword to course tags
perf: migrate course tags to a dictionary
docs: update schema
---
Reviewer notes (text between the separator above and the diffstat is
ignored by git am):

* Line breaks and indentation in this series were reconstructed from a
  whitespace-collapsed copy. TODO: confirm against the original export
  before applying. The "index" blob ids are carried over unchanged and no
  longer match the edited file contents.
* schema.yml: most_recent_course_tags.tag_id is now Int32 (the model casts
  it with ::Int32) and most_recent_taxonomies.name is now String (it feeds
  the String dictionary field taxonomy_name). Copy-pasted descriptions of
  most_recent_course_tags, most_recent_tags and most_recent_taxonomies
  were corrected.
* most_recent_course_tags.sql: dropped the course_run and org columns that
  were computed in a CTE but never read; renamed the join aliases to match
  the tables they point at.
* most_recent_*.sql: renamed aliases copy-pasted from the object_tag model
  and replaced the trailing "select *" with an explicit column list.
* Model documentation is written as Jinja comments so the compiled SQL is
  unchanged by it.

 models/tags/course_tags.sql             | 25 +++++++
 models/tags/most_recent_course_tags.sql | 33 +++++++++
 models/tags/most_recent_object_tags.sql | 26 +++++++
 models/tags/most_recent_tags.sql        | 26 +++++++
 models/tags/most_recent_taxonomies.sql  | 26 +++++++
 models/tags/schema.yml                  | 97 +++++++++++++++++++++++++
 models/tags/sources.yml                 | 36 +++++++++
 7 files changed, 269 insertions(+)
 create mode 100644 models/tags/course_tags.sql
 create mode 100644 models/tags/most_recent_course_tags.sql
 create mode 100644 models/tags/most_recent_object_tags.sql
 create mode 100644 models/tags/most_recent_tags.sql
 create mode 100644 models/tags/most_recent_taxonomies.sql
 create mode 100644 models/tags/schema.yml
 create mode 100644 models/tags/sources.yml

diff --git a/models/tags/course_tags.sql b/models/tags/course_tags.sql
new file mode 100644
index 0000000..a5c7e25
--- /dev/null
+++ b/models/tags/course_tags.sql
@@ -0,0 +1,25 @@
+{# Dictionary keyed by (course_key, tag), loaded from most_recent_course_tags. #}
+{{
+    config(
+        materialized="dictionary",
+        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
+        fields=[
+            ("course_key", "String"),
+            ("tag", "String"),
+            ("course_name", "String"),
+            ("taxonomy_name", "String"),
+            ("lineage", "String"),
+        ],
+        primary_key="(course_key, tag)",
+        layout="COMPLEX_KEY_HASHED()",
+        lifetime=env_var("ASPECTS_COURSE_NAME_CACHE_LIFETIME", "120"),
+        source_type="clickhouse",
+        connection_overrides={
+            "host": "localhost",
+        },
+    )
+}}
+
+select course_key, tag, course_name, taxonomy_name, lineage
+from {{ ref("most_recent_course_tags") }}
+order by course_key
diff --git a/models/tags/most_recent_course_tags.sql b/models/tags/most_recent_course_tags.sql
new file mode 100644
index 0000000..b0c5363
--- /dev/null
+++ b/models/tags/most_recent_course_tags.sql
@@ -0,0 +1,33 @@
+{# One row per (course, tag id) from each course's latest dump, joined to tag and taxonomy. #}
+with
+    {# Latest dump time per (org, course_key). #}
+    most_recent_overviews as (
+        select org, course_key, max(time_last_dumped) as last_modified
+        from {{ source("event_sink", "course_overviews") }}
+        group by org, course_key
+    ),
+    {# Course rows from the latest dump, with the 'tags' field of course_data_json as a string. #}
+    most_recent_course_tags as (
+        select
+            course_key,
+            display_name as course_name,
+            JSONExtract(course_data_json, 'tags', 'String') as tags_str
+        from {{ source("event_sink", "course_overviews") }} co
+        inner join
+            most_recent_overviews mro
+            on co.org = mro.org
+            and co.course_key = mro.course_key
+            and co.time_last_dumped = mro.last_modified
+    ),
+    {# One row per element of the tags_str JSON array, cast to Int32. #}
+    parsed_tags as (
+        select
+            course_key,
+            course_name,
+            arrayJoin(JSONExtractArrayRaw(tags_str))::Int32 as tag_id
+        from most_recent_course_tags
+    )
+select course_key, course_name, tag_id, value as tag, lineage, taxonomies.name as taxonomy_name
+from parsed_tags
+inner join {{ ref("most_recent_tags") }} tags FINAL on tags.id = tag_id
+inner join {{ ref("most_recent_taxonomies") }} taxonomies FINAL on taxonomies.id = tags.taxonomy
diff --git a/models/tags/most_recent_object_tags.sql b/models/tags/most_recent_object_tags.sql
new file mode 100644
index 0000000..0884d25
--- /dev/null
+++ b/models/tags/most_recent_object_tags.sql
@@ -0,0 +1,26 @@
+{# For each id in the object_tag source, selects the row(s) from its latest dump. #}
+{{
+    config(
+        materialized="materialized_view",
+        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
+        engine=get_engine("ReplacingMergeTree()"),
+        order_by="(id)",
+        post_hook="OPTIMIZE TABLE {{ this }} {{ on_cluster() }} FINAL",
+    )
+}}
+with
+    {# Latest dump time per id. #}
+    latest as (
+        select id, max(time_last_dumped) as last_modified
+        from {{ source("event_sink", "object_tag") }}
+        group by id
+    ),
+    {# Source rows whose dump time equals the latest one for their id. #}
+    most_recent as (
+        select id, object_id, taxonomy, _value, _export_id, lineage
+        from {{ source("event_sink", "object_tag") }} src
+        inner join
+            latest newest on newest.id = src.id and src.time_last_dumped = newest.last_modified
+    )
+select id, object_id, taxonomy, _value, _export_id, lineage
+from most_recent
diff --git a/models/tags/most_recent_tags.sql b/models/tags/most_recent_tags.sql
new file mode 100644
index 0000000..909cdb1
--- /dev/null
+++ b/models/tags/most_recent_tags.sql
@@ -0,0 +1,26 @@
+{# For each id in the tag source, selects the row(s) from its latest dump. #}
+{{
+    config(
+        materialized="materialized_view",
+        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
+        engine=get_engine("ReplacingMergeTree()"),
+        order_by="(id)",
+        post_hook="OPTIMIZE TABLE {{ this }} {{ on_cluster() }} FINAL",
+    )
+}}
+with
+    {# Latest dump time per id. #}
+    latest as (
+        select id, max(time_last_dumped) as last_modified
+        from {{ source("event_sink", "tag") }}
+        group by id
+    ),
+    {# Source rows whose dump time equals the latest one for their id. #}
+    most_recent as (
+        select id, taxonomy, parent, value, external_id, lineage
+        from {{ source("event_sink", "tag") }} src
+        inner join
+            latest newest on newest.id = src.id and src.time_last_dumped = newest.last_modified
+    )
+select id, taxonomy, parent, value, external_id, lineage
+from most_recent
diff --git a/models/tags/most_recent_taxonomies.sql b/models/tags/most_recent_taxonomies.sql
new file mode 100644
index 0000000..6a12a2a
--- /dev/null
+++ b/models/tags/most_recent_taxonomies.sql
@@ -0,0 +1,26 @@
+{# For each id in the taxonomy source, selects the row(s) from its latest dump. #}
+{{
+    config(
+        materialized="materialized_view",
+        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
+        engine=get_engine("ReplacingMergeTree()"),
+        order_by="(id)",
+        post_hook="OPTIMIZE TABLE {{ this }} {{ on_cluster() }} FINAL",
+    )
+}}
+with
+    {# Latest dump time per id. #}
+    latest as (
+        select id, max(time_last_dumped) as last_modified
+        from {{ source("event_sink", "taxonomy") }}
+        group by id
+    ),
+    {# Source rows whose dump time equals the latest one for their id. #}
+    most_recent as (
+        select id, name
+        from {{ source("event_sink", "taxonomy") }} src
+        inner join
+            latest newest on newest.id = src.id and src.time_last_dumped = newest.last_modified
+    )
+select id, name
+from most_recent
diff --git a/models/tags/schema.yml b/models/tags/schema.yml
new file mode 100644
index 0000000..8e76394
--- /dev/null
+++ b/models/tags/schema.yml
@@ -0,0 +1,97 @@
+version: 2
+
+models:
+  - name: course_tags
+    description: "Tag with lineage data for courses (dictionary)"
+    columns:
+      - name: course_key
+        data_type: String
+        description: "The course key for the course"
+      - name: tag
+        data_type: String
+        description: "The name of the tag"
+      - name: course_name
+        data_type: String
+        description: "The name of the course"
+      - name: taxonomy_name
+        data_type: String
+        description: "The name of the parent taxonomy of the tag"
+      - name: lineage
+        data_type: String
+        description: "Lineage data of the tag for parent tags"
+
+  - name: most_recent_course_tags
+    description: "Tag with lineage data for courses (view)"
+    columns:
+      - name: course_key
+        data_type: String
+        description: "The course key for the course"
+      - name: course_name
+        data_type: String
+        description: "The name of the course"
+      - name: tag_id
+        data_type: Int32
+        description: "The ID of the tag"
+      - name: tag
+        data_type: String
+        description: "The name of the tag"
+      - name: lineage
+        data_type: String
+        description: "Lineage data of the tag for parent tags"
+      - name: taxonomy_name
+        data_type: String
+        description: "The name of the parent taxonomy of the tag"
+
+  - name: most_recent_object_tags
+    description: "Latest object tags"
+    columns:
+      - name: id
+        data_type: Int32
+        description: "The record ID"
+      - name: object_id
+        data_type: String
+        description: "The tagged object"
+      - name: taxonomy
+        data_type: Int32
+        description: "Foreign key to the taxonomy"
+      - name: _value
+        data_type: String
+        description: "Tag string"
+      - name: _export_id
+        data_type: String
+        description: "Export ID"
+      - name: lineage
+        data_type: String
+        description: "Lineage data of the tag for parent tags"
+
+  - name: most_recent_tags
+    description: "Latest tags"
+    columns:
+      - name: id
+        data_type: Int32
+        description: "The record ID"
+      - name: taxonomy
+        data_type: Int32
+        description: "Foreign key to the taxonomy"
+      - name: parent
+        data_type: Int32
+        description: "Foreign key to the parent tag"
+      - name: value
+        data_type: String
+        description: "Tag string"
+      - name: external_id
+        data_type: String
+        description: "External ID"
+      - name: lineage
+        data_type: String
+        description: "Lineage data of the tag for parent tags"
+
+  - name: most_recent_taxonomies
+    description: "Latest taxonomies"
+    columns:
+      - name: id
+        data_type: Int32
+        description: "The record ID"
+      - name: name
+        data_type: String
+        description: "Name for the taxonomy"
diff --git a/models/tags/sources.yml b/models/tags/sources.yml
new file mode 100644
index 0000000..cfc4ede
--- /dev/null
+++ b/models/tags/sources.yml
@@ -0,0 +1,36 @@
+version: 2
+
+sources:
+  - name: event_sink
+    database: "{{ env_var('ASPECTS_EVENT_SINK_DATABASE', 'event_sink')}}"
+    tables:
+
+      - name: object_tag
+        columns:
+          - name: id
+          - name: object_id
+          - name: taxonomy
+          - name: tag
+          - name: _value
+          - name: _export_id
+          - name: lineage
+          - name: dump_id
+          - name: time_last_dumped
+
+      - name: taxonomy
+        columns:
+          - name: id
+          - name: name
+          - name: dump_id
+          - name: time_last_dumped
+
+      - name: tag
+        columns:
+          - name: id
+          - name: taxonomy
+          - name: parent
+          - name: value
+          - name: external_id
+          - name: lineage
+          - name: dump_id
+          - name: time_last_dumped
From 52e5cab6bf86cbd402f18241d0a77f2a1991fd61 Mon Sep 17 00:00:00 2001
From: Cristhian Garcia
Date: Fri, 30 Aug 2024 12:02:30 -0500
Subject: [PATCH 2/2] fix: disable check_table_dependencies clickhouse setting
 for dbt

---
Reviewer note: the indentation of the context lines in the hunk below was
reconstructed from a whitespace-collapsed copy and is an assumption.
TODO: confirm it against .github/profiles.yml before applying.

 .github/profiles.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/profiles.yml b/.github/profiles.yml
index 0f52c1d..3f81b86 100644
--- a/.github/profiles.yml
+++ b/.github/profiles.yml
@@ -9,3 +9,6 @@ aspects: # this needs to match the profile in your dbt_project.yml file
       user: ch_admin
       password: 'ch_password'
       secure: False
+
+      custom_settings:
+        check_table_dependencies: 0