Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: create dataset for tagging #118

Merged
merged 2 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/profiles.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ aspects: # this needs to match the profile in your dbt_project.yml file
user: ch_admin
password: 'ch_password'
secure: False

custom_settings:
check_table_dependencies: 0
24 changes: 24 additions & 0 deletions models/tags/course_tags.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{#
  course_tags: dictionary model keyed by (course_key, tag) that serves the
  rows of the most_recent_course_tags model.

  Config summary:
    - created in the schema named by ASPECTS_EVENT_SINK_DATABASE
      (default "event_sink");
    - composite primary key (course_key, tag) with the COMPLEX_KEY_HASHED layout;
    - lifetime read from ASPECTS_COURSE_NAME_CACHE_LIFETIME (default "120");
    - loaded from a ClickHouse source with the host overridden to "localhost".

  NOTE(review): the lifetime variable is named after the course-name cache;
  confirm that sharing it with this dictionary is intended.
#}
{{
config(
materialized="dictionary",
schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
fields=[
("course_key", "String"),
("tag", "String"),
("course_name", "String"),
("taxonomy_name", "String"),
("lineage", "String"),
],
primary_key="(course_key, tag)",
layout="COMPLEX_KEY_HASHED()",
lifetime=env_var("ASPECTS_COURSE_NAME_CACHE_LIFETIME", "120"),
source_type="clickhouse",
connection_overrides={
"host": "localhost",
},
)
}}

{# The selected columns match the names listed in `fields` above. #}
select course_key, tag, course_name, taxonomy_name, lineage
from {{ ref("most_recent_course_tags") }}
order by course_key
31 changes: 31 additions & 0 deletions models/tags/most_recent_course_tags.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{#
  most_recent_course_tags: for the latest dump of every course, expands the
  course's tag id list and joins each id to its tag value, lineage and
  taxonomy name.

  Output columns: course_key, course_name, tag_id, tag, lineage, taxonomy_name.
#}
with
    {# Latest time_last_dumped for each (org, course_key) pair. #}
    most_recent_overviews as (
        select org, course_key, max(time_last_dumped) as last_modified
        from {{ source("event_sink", "course_overviews") }}
        group by org, course_key
    ),
    {#
      Course rows belonging to that latest dump, with the "tags" entry of
      course_data_json extracted as a string. Columns are qualified with co.
      because org and course_key exist on both sides of the join.
    #}
    most_recent_course_tags as (
        select
            co.course_key as course_key,
            co.display_name as course_name,
            JSONExtract(co.course_data_json, 'tags', 'String') as tags_str
        from {{ source("event_sink", "course_overviews") }} co
        inner join
            most_recent_overviews mro
            on co.org = mro.org
            and co.course_key = mro.course_key
            and co.time_last_dumped = mro.last_modified
    ),
    {# One row per element of the JSON array held in tags_str, cast to Int32. #}
    parsed_tags as (
        select
            course_key,
            course_name,
            arrayJoin(JSONExtractArrayRaw(tags_str))::Int32 as tag_id
        from most_recent_course_tags
    )
{# value and lineage come from most_recent_tags; name from most_recent_taxonomies. #}
select
    course_key,
    course_name,
    tag_id,
    value as tag,
    lineage,
    mrt.name as taxonomy_name
from parsed_tags
inner join {{ ref("most_recent_tags") }} mrot FINAL on mrot.id = tag_id
inner join {{ ref("most_recent_taxonomies") }} mrt FINAL on mrt.id = mrot.taxonomy
23 changes: 23 additions & 0 deletions models/tags/most_recent_object_tags.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{#
  most_recent_object_tags: materialized view over the event_sink.object_tag
  source that keeps, for each id, the row whose time_last_dumped equals the
  maximum for that id.

  Stored with a ReplacingMergeTree engine ordered by (id); the post-hook runs
  OPTIMIZE ... FINAL on the resulting table. get_engine() and on_cluster() are
  project macros that are not defined in this file.

  NOTE(review): `ot` is kept as the left-most table of the join on purpose,
  presumably because the view is driven by inserts into it. Confirm before
  reordering the join.
#}
{{
    config(
        materialized="materialized_view",
        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
        engine=get_engine("ReplacingMergeTree()"),
        order_by="(id)",
        post_hook="OPTIMIZE TABLE {{ this }} {{ on_cluster() }} FINAL",
    )
}}
with
    {# Latest dump timestamp per object-tag id. #}
    latest as (
        select id, max(time_last_dumped) as last_modified
        from {{ source("event_sink", "object_tag") }}
        group by id
    ),
    {# Columns are qualified with ot. because id exists on both join sides. #}
    most_recent as (
        select
            ot.id as id,
            ot.object_id as object_id,
            ot.taxonomy as taxonomy,
            ot._value as _value,
            ot._export_id as _export_id,
            ot.lineage as lineage
        from {{ source("event_sink", "object_tag") }} ot
        inner join
            latest mrot on mrot.id = ot.id and ot.time_last_dumped = mrot.last_modified
    )
select id, object_id, taxonomy, _value, _export_id, lineage
from most_recent
23 changes: 23 additions & 0 deletions models/tags/most_recent_tags.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{#
  most_recent_tags: materialized view over the event_sink.tag source that
  keeps, for each id, the row whose time_last_dumped equals the maximum for
  that id.

  Stored with a ReplacingMergeTree engine ordered by (id); the post-hook runs
  OPTIMIZE ... FINAL on the resulting table. get_engine() and on_cluster() are
  project macros that are not defined in this file.

  NOTE(review): `ot` is kept as the left-most table of the join on purpose,
  presumably because the view is driven by inserts into it. Confirm before
  reordering the join.
#}
{{
    config(
        materialized="materialized_view",
        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
        engine=get_engine("ReplacingMergeTree()"),
        order_by="(id)",
        post_hook="OPTIMIZE TABLE {{ this }} {{ on_cluster() }} FINAL",
    )
}}
with
    {# Latest dump timestamp per tag id. #}
    latest as (
        select id, max(time_last_dumped) as last_modified
        from {{ source("event_sink", "tag") }}
        group by id
    ),
    {# Columns are qualified with ot. because id exists on both join sides. #}
    most_recent as (
        select
            ot.id as id,
            ot.taxonomy as taxonomy,
            ot.parent as parent,
            ot.value as value,
            ot.external_id as external_id,
            ot.lineage as lineage
        from {{ source("event_sink", "tag") }} ot
        inner join
            latest mrot on mrot.id = ot.id and ot.time_last_dumped = mrot.last_modified
    )
select id, taxonomy, parent, value, external_id, lineage
from most_recent
23 changes: 23 additions & 0 deletions models/tags/most_recent_taxonomies.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{#
  most_recent_taxonomies: materialized view over the event_sink.taxonomy
  source that keeps, for each id, the row whose time_last_dumped equals the
  maximum for that id.

  Stored with a ReplacingMergeTree engine ordered by (id); the post-hook runs
  OPTIMIZE ... FINAL on the resulting table. get_engine() and on_cluster() are
  project macros that are not defined in this file.

  NOTE(review): `ot` is kept as the left-most table of the join on purpose,
  presumably because the view is driven by inserts into it. Confirm before
  reordering the join.
#}
{{
    config(
        materialized="materialized_view",
        schema=env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink"),
        engine=get_engine("ReplacingMergeTree()"),
        order_by="(id)",
        post_hook="OPTIMIZE TABLE {{ this }} {{ on_cluster() }} FINAL",
    )
}}
with
    {# Latest dump timestamp per taxonomy id. #}
    latest as (
        select id, max(time_last_dumped) as last_modified
        from {{ source("event_sink", "taxonomy") }}
        group by id
    ),
    {# Columns are qualified with ot. because id exists on both join sides. #}
    most_recent as (
        select
            ot.id as id,
            ot.name as name
        from {{ source("event_sink", "taxonomy") }} ot
        inner join
            latest mrot on mrot.id = ot.id and ot.time_last_dumped = mrot.last_modified
    )
select id, name
from most_recent
97 changes: 97 additions & 0 deletions models/tags/schema.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
version: 2

# Column documentation for the tag models in models/tags.
# Each data_type mirrors what the corresponding model's SQL produces.
models:
  - name: course_tags
    description: "Tag with lineage data for courses (dictionary)"
    columns:
      - name: course_key
        data_type: String
        description: "The course key for the course"
      - name: tag
        data_type: String
        description: "The name of the tag"
      - name: course_name
        data_type: String
        description: "The name of the course"
      - name: taxonomy_name
        data_type: String
        description: "The name of the parent taxonomy of the tag"
      - name: lineage
        data_type: String
        description: "Lineage data of the tag for parent tags"

  - name: most_recent_course_tags
    description: "Tag with lineage data for courses (view)"
    columns:
      - name: course_key
        data_type: String
        description: "The course key for the course"
      - name: course_name
        data_type: String
        description: "The name of the course"
      - name: tag_id
        # The model casts each parsed tag id with ::Int32.
        data_type: Int32
        description: "The ID of the tag"
      - name: tag
        data_type: String
        description: "The name of the tag"
      - name: lineage
        data_type: String
        description: "Lineage data of the tag for parent tags"
      - name: taxonomy_name
        data_type: String
        description: "The name of the parent taxonomy of the tag"

  - name: most_recent_object_tags
    description: "Latest object tags"
    columns:
      - name: id
        data_type: Int32
        description: "The record ID"
      - name: object_id
        data_type: String
        description: "The tagged object"
      - name: taxonomy
        data_type: Int32
        description: "Foreign key to the taxonomy"
      - name: _value
        data_type: String
        description: "Tag string"
      - name: _export_id
        data_type: String
        description: "Export ID"
      - name: lineage
        data_type: String
        description: "Lineage data of the tag for parent tags"

  - name: most_recent_tags
    description: "Latest tags"
    columns:
      - name: id
        data_type: Int32
        description: "The record ID"
      - name: taxonomy
        data_type: Int32
        description: "Foreign key to the taxonomy"
      - name: parent
        data_type: Int32
        description: "Foreign key to the parent tag"
      - name: value
        data_type: String
        description: "Tag string"
      - name: external_id
        data_type: String
        description: "External ID"
      - name: lineage
        data_type: String
        description: "Lineage data of the tag for parent tags"

  - name: most_recent_taxonomies
    description: "Latest taxonomies"
    columns:
      - name: id
        data_type: Int32
        description: "The record ID"
      - name: name
        # Exposed downstream as the String field taxonomy_name.
        data_type: String
        description: "Name for the taxonomy"
36 changes: 36 additions & 0 deletions models/tags/sources.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Source declarations for the tag models in models/tags.
# NOTE(review): most_recent_course_tags.sql also reads the "course_overviews"
# table of this source, which is not listed here; presumably it is declared in
# another sources file. Verify.
version: 2

sources:
- name: event_sink
# Database name comes from ASPECTS_EVENT_SINK_DATABASE, defaulting to "event_sink".
database: "{{ env_var('ASPECTS_EVENT_SINK_DATABASE', 'event_sink')}}"
tables:

# Read by the most_recent_object_tags model.
- name: object_tag
columns:
- name: id
- name: object_id
- name: taxonomy
- name: tag
- name: _value
- name: _export_id
- name: lineage
- name: dump_id
- name: time_last_dumped

# Read by the most_recent_taxonomies model.
- name: taxonomy
columns:
- name: id
- name: name
- name: dump_id
- name: time_last_dumped

# Read by the most_recent_tags model.
- name: tag
columns:
- name: id
- name: taxonomy
- name: parent
- name: value
- name: external_id
- name: lineage
- name: dump_id
- name: time_last_dumped
Loading