Skip to content

Commit

Permalink
filter out any files not ending with json or jsonl (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
samos123 authored Dec 16, 2023
1 parent dec3d29 commit 4a810bb
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
6 changes: 5 additions & 1 deletion batchelor/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,18 @@ def parse_bucket(path: str) -> str:
return path.split("/")[2]


def filter_json_files(paths: list[str]) -> list[str]:
    """Return only the entries of *paths* that name JSON or JSON Lines files.

    A path is kept when it ends with ``.json`` or ``.jsonl``; the match is
    case-sensitive. The relative order of the kept paths is preserved and the
    input list is not modified.
    """
    # str.endswith accepts a tuple of suffixes, so one call covers both.
    json_suffixes = (".json", ".jsonl")
    return [path for path in paths if path.endswith(json_suffixes)]


def convert_path_to_list(path: str) -> list[str]:
    """Expand *path* into the list of input files it refers to.

    For a ``gs://`` URI, the bucket is listed through a new
    ``storage.Client()`` and every blob under the URI's object prefix is
    returned as a full ``gs://<bucket>/<blob name>`` path, keeping only the
    entries accepted by ``filter_json_files``. Any other path is returned
    unchanged as a single-element list, without filtering.
    """
    scheme = "gs://"
    if not path.startswith(scheme):
        return [path]

    bucket_name = parse_bucket(path)
    # list_blobs matches ``prefix`` against object names inside the bucket,
    # which never contain "gs://<bucket>/", so strip that part of the URI.
    _, _, prefix = path[len(scheme):].partition("/")
    client = storage.Client()
    paths = [
        f"gs://{bucket_name}/{blob.name}"
        # An empty prefix (bare "gs://bucket") means "list the whole bucket".
        for blob in client.list_blobs(bucket_name, prefix=prefix or None)
    ]
    return filter_json_files(paths)


Expand Down
5 changes: 4 additions & 1 deletion tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,13 @@ def test_convert_path_to_list_multiple(mocker, mock_client):
mock_client.return_value.list_blobs.return_value = [
Blob(name="path/file1.jsonl"),
Blob(name="path/file2.jsonl"),
Blob(name="path/file3.json"),
Blob(name="path/file4"),
]

output = convert_path_to_list(path)
assert mock_client.return_value.list_blobs.call_count == 1
assert len(output) == 2
assert len(output) == 3
assert output[0] == path + "/file1.jsonl"
assert output[1] == path + "/file2.jsonl"
assert output[2] == path + "/file3.json"

0 comments on commit 4a810bb

Please sign in to comment.