Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make MAXIMUM_SEED_SIZE_MIB configurable #7125

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
6 changes: 6 additions & 0 deletions .changes/unreleased/Features-20230307-134838.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Features
body: Make MAXIMUM_SEED_SIZE_MIB configurable
time: 2023-03-07T13:48:38.792321024Z
custom:
Author: noppaz acurtis-evi
Issue: 7117 7124
1 change: 1 addition & 0 deletions core/dbt/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def invoke(self, args: List[str], **kwargs) -> dbtRunnerResult:
@p.warn_error
@p.warn_error_options
@p.write_json
@p.maximum_seed_size_mib
def cli(ctx, **kwargs):
"""An ELT tool for managing your SQL transformations and data models.
For more documentation on these commands, visit: docs.getdbt.com
Expand Down
10 changes: 9 additions & 1 deletion core/dbt/cli/params.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ def _version_callback(ctx, _param, value):
envvar="DBT_WARN_ERROR_OPTIONS",
default="{}",
help="""If dbt would normally warn, instead raise an exception based on include/exclude configuration. Examples include --select that selects nothing, deprecations, configurations with no associated models, invalid test configurations,
and missing sources/refs in tests. This argument should be a YAML string, with keys 'include' or 'exclude'. eg. '{"include": "all", "exclude": ["NoNodesForSelectionCriteria"]}'""",
and missing sources/refs in tests. This argument should be a JSON string, with keys 'include' or 'exclude'. eg. '{"include": "all", "exclude": ["NoNodesForSelectionCriteria"]}'""",
type=WarnErrorOptionsType(),
)

Expand All @@ -543,3 +543,11 @@ def _version_callback(ctx, _param, value):
help="Whether or not to write the manifest.json and run_results.json files to the target directory",
default=True,
)

# Global option controlling how large a seed file may be before dbt stops
# hashing its contents for state comparison.
maximum_seed_size_mib = click.option(
    "--maximum-seed-size-mib",
    envvar="DBT_MAXIMUM_SEED_SIZE_MIB",
    help="Specify max size (MiB) for seed files that will be hashed for state comparison. Use 0 to hash seed files of any size.",
    # A negative limit has no meaning and would classify every seed as
    # oversized, so reject it at the CLI boundary; 0 means "no limit".
    type=click.IntRange(min=0),
    default=1,
)
2 changes: 0 additions & 2 deletions core/dbt/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
DEFAULT_ENV_PLACEHOLDER = "DBT_DEFAULT_PLACEHOLDER"
METADATA_ENV_PREFIX = "DBT_ENV_CUSTOM_ENV_"

MAXIMUM_SEED_SIZE = 1 * 1024 * 1024
MAXIMUM_SEED_SIZE_NAME = "1MB"

PIN_PACKAGE_URL = (
"https://docs.getdbt.com/docs/package-management#section-specifying-package-versions"
Expand Down
30 changes: 26 additions & 4 deletions core/dbt/contracts/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
from mashumaro.types import SerializableType
from typing import List, Optional, Union, Dict, Any

from dbt.constants import MAXIMUM_SEED_SIZE

from dbt.clients.system import convert_path

from dbt.dataclass_schema import dbtClassMixin, StrEnum

from .util import SourceKey
Expand Down Expand Up @@ -63,9 +65,8 @@ def absolute_path(self) -> str:
def original_file_path(self) -> str:
return os.path.join(self.searched_path, self.relative_path)

def seed_too_large(self) -> bool:
"""Return whether the file this represents is over the seed size limit"""
return os.stat(self.full_path).st_size > MAXIMUM_SEED_SIZE
def file_size(self) -> int:
    """Return the size, in bytes, of the file at ``self.full_path``."""
    size_in_bytes = os.path.getsize(self.full_path)
    return size_in_bytes


@dataclass
Expand Down Expand Up @@ -107,6 +108,27 @@ def from_contents(cls, contents: str, name="sha256") -> "FileHash":
checksum = hashlib.new(name, data).hexdigest()
return cls(name=name, checksum=checksum)

@classmethod
def from_path(cls, path: str, name="sha256") -> "FileHash":
    """Create a file hash by streaming the file at ``path`` in chunks.

    The digest is computed over the UTF-8 encoding of the file's text with
    leading and trailing whitespace removed, i.e. the same bytes as
    ``contents.strip().encode("utf-8")`` for the fully loaded contents,
    but without holding the whole file in memory.

    :param path: Path of the file to hash.
    :param name: Name of the hashlib algorithm to use.
    :return: A ``FileHash`` carrying the algorithm name and hex digest.
    :raises OSError: If the file cannot be opened or read.
    :raises UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = convert_path(path)
    chunk_size = 1 * 1024 * 1024
    file_hash = hashlib.new(name)
    # encoding="utf-8" makes the result independent of the platform locale.
    # newline="" disables newline translation so the hashed text is exactly
    # the decoded file contents.
    # NOTE(review): identical digests to the previous load-then-hash path
    # require that loader to decode raw bytes without newline translation;
    # confirm against load_file_contents.
    with open(path, "r", encoding="utf-8", newline="") as handle:
        seen_content = False
        # Whitespace seen after the last hashed text. It is only hashed once
        # more non-whitespace text follows; if EOF comes first it was
        # trailing whitespace and is dropped.
        pending_whitespace = ""
        for chunk in iter(lambda: handle.read(chunk_size), ""):
            if not seen_content:
                # Leading whitespace may span several chunks, so keep
                # stripping until the first real content shows up.
                chunk = chunk.lstrip()
                if not chunk:
                    continue
                seen_content = True
            body = chunk.rstrip()
            if body:
                file_hash.update((pending_whitespace + body).encode("utf-8"))
                pending_whitespace = chunk[len(body):]
            else:
                # Chunk is entirely whitespace: interior or trailing, not
                # known yet.
                pending_whitespace += chunk
    return cls(name=name, checksum=file_hash.hexdigest())


@dataclass
class RemoteFile(dbtClassMixin):
Expand Down
1 change: 1 addition & 0 deletions core/dbt/contracts/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ class UserConfig(ExtensibleDbtClassMixin, Replaceable, UserConfigContract):
log_format_file: Optional[str] = None
log_level: Optional[str] = None
log_level_file: Optional[str] = None
maximum_seed_size_mib: Optional[int] = None
partial_parse: Optional[bool] = None
populate_cache: Optional[bool] = None
printer_width: Optional[int] = None
Expand Down
17 changes: 11 additions & 6 deletions core/dbt/events/types.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json

from dbt.ui import line_wrap_message, warning_tag, red, green, yellow
from dbt.constants import MAXIMUM_SEED_SIZE_NAME, PIN_PACKAGE_URL
from dbt.constants import PIN_PACKAGE_URL
from dbt.events.base_types import (
DynamicLevel,
DebugLevel,
Expand All @@ -12,6 +12,7 @@
)
from dbt.events.format import format_fancy_output_line, pluralize, timestamp_to_datetime_string

from dbt.flags import get_flags
from dbt.node_types import NodeType


Expand Down Expand Up @@ -46,6 +47,10 @@ def format_adapter_message(name, base_msg, args) -> str:
return f"{name} adapter: {msg}"


def get_maximum_seed_size_name() -> str:
    """Return the MAXIMUM_SEED_SIZE_MIB flag value followed by the "MiB" unit suffix."""
    limit_mib = get_flags().MAXIMUM_SEED_SIZE_MIB
    return f"{limit_mib}MiB"


# =======================================================
# A - Pre-project loading
# =======================================================
Expand Down Expand Up @@ -966,8 +971,8 @@ def code(self):
def message(self) -> str:
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was "
f"<={MAXIMUM_SEED_SIZE_NAME}, so it has changed"
f">{get_maximum_seed_size_name()} in size. The previous file was "
f"<={get_maximum_seed_size_name()}, so it has changed"
)
return msg

Expand All @@ -979,7 +984,7 @@ def code(self):
def message(self) -> str:
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size at the same path, dbt "
f">{get_maximum_seed_size_name()} in size at the same path, dbt "
f"cannot tell if it has changed: assuming they are the same"
)
return msg
Expand All @@ -992,7 +997,7 @@ def code(self):
def message(self) -> str:
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file was in "
f">{get_maximum_seed_size_name()} in size. The previous file was in "
f"a different location, assuming it has changed"
)
return msg
Expand All @@ -1005,7 +1010,7 @@ def code(self):
def message(self) -> str:
msg = (
f"Found a seed ({self.package_name}.{self.name}) "
f">{MAXIMUM_SEED_SIZE_NAME} in size. The previous file had a "
f">{get_maximum_seed_size_name()} in size. The previous file had a "
f"checksum type of {self.checksum_name}, so it has changed"
)
return msg
Expand Down
1 change: 1 addition & 0 deletions core/dbt/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def get_flag_dict():
"introspect",
"target_path",
"log_path",
"maximum_seed_size_mib",
}
return {key: getattr(GLOBAL_FLAGS, key.upper(), None) for key in flag_attr}

Expand Down
14 changes: 11 additions & 3 deletions core/dbt/parser/read_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from typing import Optional, Dict, List, Mapping
from dbt.events.types import InputFileDiffError
from dbt.events.functions import fire_event
from dbt.flags import get_flags


@dataclass
Expand All @@ -37,6 +38,12 @@ class FileDiff(dbtClassMixin):
added: List[InputFile]


def get_max_seed_size() -> int:
    """Return the maximum seed size in bytes.

    The MAXIMUM_SEED_SIZE_MIB flag is expressed in MiB; it is converted to
    bytes here.
    """
    bytes_per_mib = 1024 * 1024
    return get_flags().MAXIMUM_SEED_SIZE_MIB * bytes_per_mib


# This loads the files contents and creates the SourceFile object
def load_source_file(
path: FilePath,
Expand Down Expand Up @@ -113,12 +120,13 @@ def validate_yaml(file_path, dct):

# Special processing for big seed files
def load_seed_source_file(match: FilePath, project_name) -> SourceFile:
if match.seed_too_large():
maximum_seed_size = get_max_seed_size()
# maximum_seed_size = 0 means no limit
if match.file_size() > maximum_seed_size and maximum_seed_size != 0:
# We don't want to calculate a hash of this file. Use the path.
source_file = SourceFile.big_seed(match)
else:
file_contents = load_file_contents(match.absolute_path, strip=True)
checksum = FileHash.from_contents(file_contents)
checksum = FileHash.from_path(match.absolute_path)
source_file = SourceFile(path=match, checksum=checksum)
source_file.contents = ""
source_file.parse_file_type = ParseFileType.Seed
Expand Down
4 changes: 2 additions & 2 deletions test/unit/test_graph_selector_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,7 @@ def test_select_state_changed_seed_checksum_path_to_path(manifest, previous_stat
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedExceedsLimitSamePath"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
Expand All @@ -1208,7 +1208,7 @@ def test_select_state_changed_seed_checksum_sha_to_path(manifest, previous_state
event = warn_or_error_patch.call_args[0][0]
assert type(event).__name__ == "SeedIncreased"
msg = event.message()
assert msg.startswith("Found a seed (pkg.seed) >1MB in size")
assert msg.startswith("Found a seed (pkg.seed) >1MiB in size")
with mock.patch("dbt.contracts.graph.nodes.warn_or_error") as warn_or_error_patch:
assert not search_manifest_using_method(manifest, method, "new")
warn_or_error_patch.assert_not_called()
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/defer_state/test_modified_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def test_changed_seed_contents_state(self, project):
"./state",
]
)
assert ">1MB" in str(exc.value)
assert ">1MiB" in str(exc.value)

shutil.rmtree("./state")
self.copy_state()
Expand Down