Skip to content

Commit

Permalink
Make --event-time-start and --event-time-end mutually required (#…
Browse files Browse the repository at this point in the history
…10878)

* Stop validating that `--event-time-start` is before "current" time

In the next commit we'll be adding a validation that requires that `--event-time-start`
and `--event-time-end` are mutually required. That is, whenever one is specified,
the other is required. In that world, `--event-time-start` will never need to be compared
against the "current" time, because it'll never be run in conjunction with the "current"
time.

* Validate that `--event-time-start` and `--event-time-end` are mutually present

* Add changie doc for validation changes

* Alter functional microbatch tests to work with updated `event_time_start/end` reqs

We made it such that when `event_time_start` is specified, `event_time_end` must also
be specified (and vice versa). This broke numerous tests, in a few different ways:

1. There were tests that used `--event-time-start` without `--event-time-end` butg
were using event_time_start essentially as the `begin` time for models being initially
built or full refreshed. These tests could simply drop the `--event-time-start` and
instead rely on the `begin` value.

2. There was a test  that was trying to load a subset of the data _excluding_ some
data which would be captured by using `begin`. In this test we added an appropriate
`--event-time-end` as the `--event-time-start` was necessary to statisfy what the
test was testing

3. There was a test which was trying to ensure that two microbatch models would be
given the same "current" time. Because we wanted to ensure the "current" time code
path was used, we couldn't add `--event-time-end` to resolve the problem, thus we
needed to remove the `--event-time-start` that was being used. However, this led to
the test being incredibly slow. This was resolved by switching the relevant microbatch
models from having `batch_size`s of `day` to instead have `year`. This solution should
be good enough for roughly ~40 years? We'll figure out a better solution then, so see ya
in 2064. Assuming I haven't died before my 70th birthday, feel free to ping me to get
this taken care of.

---------

Co-authored-by: Michelle Ark <michelle.ark@dbtlabs.com>
  • Loading branch information
QMalcolm and MichelleArk authored Oct 29, 2024
1 parent 6b5db17 commit 8df5c96
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 46 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Fixes-20241017-145357.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Fixes
body: Make `--event-time-start` and `--event-time-end` mutually required
time: 2024-10-17T14:53:57.149238-07:00
custom:
Author: QMalcolm
Issue: "10874"
26 changes: 17 additions & 9 deletions core/dbt/cli/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from pprint import pformat as pf
from typing import Any, Callable, Dict, List, Optional, Set, Union

import pytz
from click import Context, Parameter, get_current_context
from click.core import Command as ClickCommand
from click.core import Group, ParameterSource
Expand Down Expand Up @@ -362,17 +361,26 @@ def _validate_event_time_configs(self) -> None:
getattr(self, "EVENT_TIME_END") if hasattr(self, "EVENT_TIME_END") else None
)

if event_time_start is not None and event_time_end is not None:
if event_time_start >= event_time_end:
# only do validations if at least one of `event_time_start` or `event_time_end` are specified
if event_time_start is not None or event_time_end is not None:

# These `ifs`, combined with the parent `if` make it so that `event_time_start` and
# `event_time_end` are mutually required
if event_time_start is None:
raise DbtUsageException(
"Value for `--event-time-start` must be less than `--event-time-end`"
"The flag `--event-time-end` was specified, but `--event-time-start` was not. "
"When specifying `--event-time-end`, `--event-time-start` must also be present."
)
if event_time_end is None:
raise DbtUsageException(
"The flag `--event-time-start` was specified, but `--event-time-end` was not. "
"When specifying `--event-time-start`, `--event-time-end` must also be present."
)
elif event_time_start is not None:
utc_start = event_time_start.replace(tzinfo=pytz.UTC)
current_time = datetime.now(pytz.UTC)
if utc_start >= current_time:

# This `if` just is a sanity check that `event_time_start` is before `event_time_end`
if event_time_start >= event_time_end:
raise DbtUsageException(
f"Value for `--event-time-start` ({utc_start}) must be less than the current time ({current_time}) if `--event-time-end` is not specififed"
"Value for `--event-time-start` must be less than `--event-time-end`"
)

def fire_deprecations(self):
Expand Down
27 changes: 12 additions & 15 deletions tests/functional/cli/test_option_interaction_validations.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
from datetime import datetime

import pytest
from freezegun import freeze_time

from dbt.tests.util import run_dbt

Expand Down Expand Up @@ -34,19 +31,19 @@ def test_option_combo(self, project, event_time_start, event_time_end, expect_pa
assert not expect_pass


class TestEventTimeStartCurrent_time:
class TestEventTimeEndEventTimeStartMutuallyRequired:
@pytest.mark.parametrize(
"event_time_start,current_time,expect_pass",
"specified,missing",
[
("2024-10-01", "2024-10-02", True),
("2024-10-02", "2024-10-01", False),
("--event-time-start", "--event-time-end"),
("--event-time-end", "--event-time-start"),
],
)
def test_option_combo(self, project, event_time_start, current_time, expect_pass):
with freeze_time(datetime.fromisoformat(current_time)):
try:
run_dbt(["build", "--event-time-start", event_time_start])
assert expect_pass
except Exception as e:
assert "must be less than the current time" in e.__str__()
assert not expect_pass
def test_option_combo(self, project, specified, missing):
try:
run_dbt(["build", specified, "2024-10-01"])
assert False, f"An error should have been raised for missing `{missing}` flag"
except Exception as e:
assert (
f"When specifying `{specified}`, `{missing}` must also be present." in e.__str__()
)
50 changes: 28 additions & 22 deletions tests/functional/microbatch/test_microbatch.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
import os
from datetime import datetime
from unittest import mock

import pytest
import pytz

from dbt.events.types import LogModelResult, MicrobatchModelNoEventTimeInputs
from dbt.tests.util import (
Expand Down Expand Up @@ -42,6 +40,16 @@
select * from {{ ref('input_model') }}
"""

microbatch_yearly_model_sql = """
{{ config(materialized='incremental', incremental_strategy='microbatch', unique_key='id', event_time='event_time', batch_size='year', begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0)) }}
select * from {{ ref('input_model') }}
"""

microbatch_yearly_model_downstream_sql = """
{{ config(materialized='incremental', incremental_strategy='microbatch', unique_key='id', event_time='event_time', batch_size='year', begin=modules.datetime.datetime(2020, 1, 1, 0, 0, 0)) }}
select * from {{ ref('microbatch_model') }}
"""

invalid_batch_context_macro_sql = """
{% macro check_invalid_batch_context() %}
Expand Down Expand Up @@ -232,15 +240,6 @@ def test_run_with_event_time(self, project):
# creation events
assert batch_creation_events == 3

# build model >= 2020-01-02
with patch_microbatch_end_time("2020-01-03 13:57:00"):
run_dbt(["run", "--event-time-start", "2020-01-02", "--full-refresh"])
self.assert_row_count(project, "microbatch_model", 2)

# build model < 2020-01-03
run_dbt(["run", "--event-time-end", "2020-01-03", "--full-refresh"])
self.assert_row_count(project, "microbatch_model", 2)

# build model between 2020-01-02 >= event_time < 2020-01-03
run_dbt(
[
Expand Down Expand Up @@ -466,7 +465,7 @@ def models(self):
@mock.patch.dict(os.environ, {"DBT_EXPERIMENTAL_MICROBATCH": "True"})
def test_run_with_event_time_logs(self, project):
with patch_microbatch_end_time("2020-01-03 13:57:00"):
_, logs = run_dbt_and_capture(["run", "--event-time-start", "2020-01-01"])
_, logs = run_dbt_and_capture(["run"])

assert "start: 2020-01-01 00:00:00+00:00" in logs
assert "end: 2020-01-02 00:00:00+00:00" in logs
Expand Down Expand Up @@ -500,7 +499,7 @@ def models(self):
def test_run_with_event_time(self, project):
# run all partitions from start - 2 expected rows in output, one failed
with patch_microbatch_end_time("2020-01-03 13:57:00"):
run_dbt(["run", "--event-time-start", "2020-01-01"], expect_pass=False)
run_dbt(["run"], expect_pass=False)
self.assert_row_count(project, "microbatch_model", 2)

run_results = get_artifact(project.project_root, "target", "run_results.json")
Expand All @@ -525,7 +524,7 @@ def models(self):
def test_run_with_event_time(self, project):
# run all partitions from start - 2 expected rows in output, one failed
with patch_microbatch_end_time("2020-01-03 13:57:00"):
_, console_output = run_dbt_and_capture(["run", "--event-time-start", "2020-01-01"])
_, console_output = run_dbt_and_capture(["run"])

assert "PARTIAL SUCCESS (2/3)" in console_output
assert "Completed with 1 partial success" in console_output
Expand Down Expand Up @@ -565,7 +564,7 @@ def models(self):
def test_run_with_event_time(self, project):
# run all partitions from start - 2 expected rows in output, one failed
with patch_microbatch_end_time("2020-01-03 13:57:00"):
_, console_output = run_dbt_and_capture(["run", "--event-time-start", "2020-01-01"])
_, console_output = run_dbt_and_capture(["run"])

assert "PARTIAL SUCCESS (2/3)" in console_output
assert "Completed with 1 partial success" in console_output
Expand Down Expand Up @@ -612,7 +611,7 @@ def models(self):
def test_run_with_event_time(self, project):
# run all partitions from start - 2 expected rows in output, one failed
with patch_microbatch_end_time("2020-01-03 13:57:00"):
run_dbt(["run", "--event-time-start", "2020-01-01"])
run_dbt(["run"])
self.assert_row_count(project, "microbatch_model", 2)


Expand All @@ -621,7 +620,7 @@ class TestMicrobatchCompiledRunPaths(BaseMicrobatchTest):
def test_run_with_event_time(self, project):
# run all partitions from start - 2 expected rows in output, one failed
with patch_microbatch_end_time("2020-01-03 13:57:00"):
run_dbt(["run", "--event-time-start", "2020-01-01"])
run_dbt(["run"])

# Compiled paths - compiled model without filter only
assert read_file(
Expand Down Expand Up @@ -704,7 +703,15 @@ def models(self):
def test_run_with_event_time(self, project):
# run all partitions from 2020-01-02 to spoofed "now" - 2 expected rows in output
with patch_microbatch_end_time("2020-01-03 13:57:00"):
run_dbt(["run", "--event-time-start", "2020-01-02"])
run_dbt(
[
"run",
"--event-time-start",
"2020-01-02",
"--event-time-end",
"2020-01-03 13:57:00",
]
)
self.assert_row_count(project, "microbatch_model", 2)

# re-running shouldn't change what it's in the data set because there is nothing new
Expand Down Expand Up @@ -733,14 +740,13 @@ class TestMicrbobatchModelsRunWithSameCurrentTime(BaseMicrobatchTest):
def models(self):
return {
"input_model.sql": input_model_sql,
"microbatch_model.sql": microbatch_model_sql,
"second_microbatch_model.sql": microbatch_model_downstream_sql,
"microbatch_model.sql": microbatch_yearly_model_sql,
"second_microbatch_model.sql": microbatch_yearly_model_downstream_sql,
}

@mock.patch.dict(os.environ, {"DBT_EXPERIMENTAL_MICROBATCH": "True"})
def test_microbatch(self, project) -> None:
current_time = datetime.now(pytz.UTC)
run_dbt(["run", "--event-time-start", current_time.strftime("%Y-%m-%d")])
run_dbt(["run"])

run_results = get_artifact(project.project_root, "target", "run_results.json")
microbatch_model_last_batch = run_results["results"][1]["batch_results"]["successful"][-1]
Expand Down

0 comments on commit 8df5c96

Please sign in to comment.