Skip to content

Commit

Permalink
Check GenBank and RefSeq for data to download
Browse files Browse the repository at this point in the history
  • Loading branch information
sminot committed Jan 30, 2024
1 parent 23709f3 commit 552a1e9
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 2 deletions.
14 changes: 14 additions & 0 deletions bin/parse_preview_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env python3
"""Parse the preview.json produced by the NCBI datasets CLI

Reads ``preview.json`` from the current working directory, prints its
contents for the log, and writes the marker file ``accession.has.data``
(containing ``TRUE``) when at least one entry under ``included_data_files``
reports a ``file_count`` greater than zero. When nothing qualifies, no
marker file is written.
"""

import json


def has_data(preview):
    """Return True if any entry of ``included_data_files`` has files.

    Args:
        preview: the parsed preview document (a dict).

    Returns:
        True when at least one value under ``included_data_files`` has a
        ``file_count`` greater than zero, otherwise False. A missing or
        empty section, or a missing/null ``file_count``, counts as no data.
    """
    included = preview.get("included_data_files") or {}
    return any(
        (entry.get("file_count") or 0) > 0
        for entry in included.values()
    )


def main():
    """Read preview.json, log it, and write the marker file if data exists."""
    # Context manager so the input handle is closed deterministically
    with open('preview.json') as handle:
        dat = json.load(handle)

    print("Checking for number of records")
    print(json.dumps(dat, indent=4))

    if has_data(dat):
        with open("accession.has.data", "w") as handle:
            handle.write('TRUE')


if __name__ == "__main__":
    main()
33 changes: 31 additions & 2 deletions templates/getDatasets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,34 @@

# Record the version of the datasets CLI in the task log
datasets --version

# Download the requested accession directly, without checking first whether
# any data is available for it.
# NOTE(review): the diff header for this file reports 2 deletions, and these
# two lines look like the removed pre-change download; confirm they are
# absent from the committed template.
echo "Downloading accession ${dataset_acc}"
datasets download genome accession ${dataset_acc} --no-progressbar --include ${params.ncbi_datasets_type}
hasData(){
    # Check to see if an accession has data available.
    #
    # Arguments: the accession to check (first positional argument).
    # Outputs:   progress messages on stdout; writes preview.json and, when
    #            data is found, accession.has.data in the working directory.
    # Returns:   0 if the preview reports data, 1 otherwise (including when
    #            the preview request itself fails).
    #
    # Dollar signs meant for bash are backslash-escaped so the templating
    # layer leaves them alone; unescaped placeholders are filled in before
    # the script runs.
    local acc="\$1"

    # Clear any marker left by an earlier call so that a stale file cannot
    # make this accession look like it has data.
    rm -f accession.has.data

    echo "Checking for data in \${acc}"
    if ! datasets \
        download \
        genome \
        accession \
        "\${acc}" \
        --no-progressbar \
        --include ${params.ncbi_datasets_type} \
        --preview > preview.json
    then
        # Without a valid preview there is nothing to parse
        echo "No data found for \${acc}"
        return 1
    fi

    # Parse the number of records; the parser writes the marker file only
    # when at least one data file is reported.
    parse_preview_json.py
    if [ -s accession.has.data ]; then
        echo "Data found for \${acc}"
        return 0
    else
        echo "No data found for \${acc}"
        return 1
    fi
}

# Try the accession as given, then the variants obtained by swapping the
# first "A_" for "F_" and the first "F_" for "A_" (presumably the GenBank
# and RefSeq forms of the same assembly), and download the first candidate
# for which hasData reports data.
INPUT_ACC="${dataset_acc}"
TRIED_ACCS=" "
for ACC in "\${INPUT_ACC}" "\${INPUT_ACC/A_/F_}" "\${INPUT_ACC/F_/A_}"; do
    # Nothing to look up for an empty accession
    [ -n "\${ACC}" ] || continue

    # At least one substitution leaves the accession unchanged; skip any
    # candidate already checked to avoid a redundant preview request.
    case "\${TRIED_ACCS}" in
        *" \${ACC} "*) continue ;;
    esac
    TRIED_ACCS="\${TRIED_ACCS}\${ACC} "

    if hasData "\${ACC}"; then
        echo "Downloading accession \${ACC}"
        datasets download genome accession "\${ACC}" --no-progressbar --include ${params.ncbi_datasets_type}
        break
    fi
done

0 comments on commit 552a1e9

Please sign in to comment.