-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgather-READMEs.sh
executable file
·182 lines (138 loc) · 10.7 KB
/
gather-READMEs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/bin/bash
# When the first argument is exactly -h, print the notes on the SKIP_FIND*
# environment variables and stop with exit status 1. Any other invocation
# falls through to the rest of the script.
case "$1" in
    -h)
        cat << _usage_
Make some variables nonempty on the command line to skip the finds within the module and database trees, for example
SKIP_FIND=yes ./gather-READMEs.sh
to skip both find commands, or
SKIP_FIND_MODULES=yes ./gather-READMEs.sh
SKIP_FIND_DATABASES=yes ./gather-READMEs.sh
to skip a specific one
_usage_
        exit 1
        ;;
esac
# Trace every command as it runs and abort on the first command that fails.
set -o xtrace
set -o errexit

# A specific skip variable that is already set (even to the empty string) is
# kept as-is; one that is unset inherits the value of SKIP_FIND.
: "${SKIP_FIND_MODULES=$SKIP_FIND}"
: "${SKIP_FIND_DATABASES=$SKIP_FIND}"

# export INSTALL_METHODS_REPOSITORY to indicate your own clone's location;
# when it is unset or empty the default path below is used.
if [[ -n $INSTALL_METHODS_REPOSITORY ]] ; then
    REPOSITORY=$INSTALL_METHODS_REPOSITORY
else
    REPOSITORY=/home/douglas/github-sync/local/install-methods
fi

# Extra find arguments; expanded unquoted by the find commands so that it
# splits into separate words.
FIND_OPTS='-maxdepth 4'
# modules
#
# Copy every '*install-README.md' found in the module trees under /sw into
# $REPOSITORY, preserving the relative directory structure (cpio pass-through
# mode). One find | cpio pipeline per tree runs in the background so the trees
# are scanned in parallel. Skipped when SKIP_FIND_MODULES is non-empty.
if [[ ! $SKIP_FIND_MODULES ]] ; then
    cd /sw
    module_pids=()
    for module_tree in apps bioinfo build comp libs parallel ; do
        # ${FIND_OPTS} is intentionally unquoted: it holds several find words.
        find "$module_tree" ${FIND_OPTS} -name '*install-README.md' | cpio -pdm "$REPOSITORY" &
        # $! is the PID of the last process of the pipeline (cpio).
        module_pids+=("$!")
    done
    # A bare 'wait' always succeeds, which would hide a failed copy even under
    # 'set -e'. Wait for each job individually, let all of them finish, and
    # only then fail, so no background job is left running on exit.
    module_failures=0
    for module_pid in "${module_pids[@]}" ; do
        wait "$module_pid" || module_failures=$((module_failures + 1))
    done
    if (( module_failures > 0 )) ; then
        echo "gather-READMEs: $module_failures module README copy job(s) failed" >&2
        exit 1
    fi
fi
# databases from /sw/data
#
# Copy every '*install-README.md' found in the listed database trees under
# /sw/data into $DATA_REPOSITORY, preserving the relative directory structure
# (cpio pass-through mode). Skipped when SKIP_FIND_DATABASES is non-empty.
# The destination directory is created even when the find step is skipped,
# because later steps cd into it.
DATA_REPOSITORY="$REPOSITORY/data"
mkdir -p "$DATA_REPOSITORY"
if [[ ! $SKIP_FIND_DATABASES ]] ; then
    cd /sw/data

    # Trees searched with ${FIND_OPTS}.
    deep_databases=(
        BioBakery_data
        CSIMicrobes_data
        Centrifuge-indices
        Chromium
    )

    # Trees searched at most two levels deep.
    shallow_databases=(
        BUSCO_data
        ChEMBL
        DRAM_data
        DeepVariant_data
        Dfam
        ExAC
        FAVOR_data
        GTDB
        GetOrganelleDB
        HG002_Q100_T2T_assembly
        HOMER
        HaplotypeReferenceConsortium
        Kraken2_data
        KrakenUniq_data
        Kraken_data
        MMseqs2_data
        Pfam
        PhyloPhlAn_data
        RTG
        RepeatMasker_data
        ViWrap
        WPS-geog
        alphafold_dataset
        annovar_data
        cdd
        chain_files
        dbCAN
        eggNOG_data
        fastq_screen_data
        gnomad_data
        iGenomes
        ncbi_taxonomy
        panther
        piper_references
        pph2-db
        reference
        silva
        snpEff_data
        soprano
        vep
        zinc15
        zinc22
        # group 'kgp' directories
        HGDP
        SGDP
        KGP
    )

    for database in "${deep_databases[@]}" ; do
        # ${FIND_OPTS} is intentionally unquoted: it holds several find words.
        find "$database" ${FIND_OPTS} -name '*install-README.md' | cpio -pdm "$DATA_REPOSITORY"
    done

    for database in "${shallow_databases[@]}" ; do
        find "$database" -maxdepth 2 -name '*install-README.md' | cpio -pdm "$DATA_REPOSITORY"
    done

    ### These commented-out databases should be brought in as well
    #
    # find dbSNP ${FIND_OPTS} -name '*install-README.md' | cpio -pdm "$DATA_REPOSITORY"
fi
# Custom database updates: first arg is source/destination directory, remaining
# args are files to gather; subdirectory structure is preserved
#
# Arguments:
#   $1     source directory; its last path component names the destination
#          directory, which is created under the current working directory
#   $2...  files to copy, given relative to $1; glob patterns are expanded
#          inside $1 (callers pass patterns such as test/*.out)
# Returns:
#   rsync's exit status, or the status of a failing mkdir/cd. The caller's
#   working directory is restored before returning, also when rsync fails.
function data_update() {
    local REMOTEDIR=${1}
    local LOCALDIR=${1##*/}
    shift
    local CURDIR=${PWD}
    # An empty IFS disables word splitting, so the unquoted expansion below
    # performs pathname expansion only and names with spaces stay intact.
    local IFS=
    local pattern
    local -a files=()
    local status=0

    mkdir -p -- "${LOCALDIR}" || return
    # Make the destination absolute before leaving the current directory.
    LOCALDIR="${CURDIR}/${LOCALDIR}"
    cd -- "${REMOTEDIR}" || return

    for pattern in "$@"; do
        # Deliberately unquoted: expand globs relative to the source directory.
        files+=( ${pattern} )
    done

    # --relative recreates each file's relative path below the destination.
    rsync -Pa --relative "${files[@]}" "${LOCALDIR}/" || status=$?

    cd -- "${CURDIR}" || return
    return "${status}"
}
# These databases are in /sw/data and update via crontab with an update
# script, OR these are additional scripts harvested from databases examined
# above. Update the repository copy of their READMEs (if necessary), scripts
# and other files.
cd "$DATA_REPOSITORY"
# iGenomes and SGDP: additional files only, their READMEs are harvested above
data_update /sw/data/iGenomes tools/aws-iGenomes-download-all.sh tools/build-iGenomes-additions.sh tools/iGenomes-STAR-2.7.x-index.sh tools/iGenomes-genes.bed.sh tools/gtf2bed.pl
data_update /sw/data/SGDP check_md5s.pl
data_update /sw/data/Kraken2_data Kraken2-update-db.sh Kraken2-update-nt.sh Kraken2-update-prebuilt.sh
data_update /sw/data/KrakenUniq_data KrakenUniq_data-update-custom-db.sh KrakenUniq_data-update-custom-db_veryfat.sh KrakenUniq_data-update-standard-db.sh KrakenUniq_data-update-standard-db_veryfat.sh
data_update /sw/data/Kraken_data Kraken-update-db.sh
data_update /sw/data/RTG RTG-update-dbs.sh
data_update /sw/data/diamond_databases diamond-update-dbs.sh diamond-check-dbs.sh diamond-update-reference_proteomes.sh diamond-update-dbs.sh.0.9.29 Makefile DIAMOND-DBs.md DIAMOND-DBs.html
data_update /sw/data/FAVOR_data FAVOR_fetch.py FAVOR_api_token.py
data_update /sw/data/MEMEsuite MEMEsuite-update-db.sh
data_update /sw/data/BUSCO_data BUSCO-update-v1-lineage-sets.sh BUSCO-update-v2-lineage-sets.sh BUSCO-update-v4-lineage-sets.sh BUSCO-update-v5-lineage-sets.sh
# 'test/*.out' is quoted so that it is not globbed here against the repository
# directory; data_update expands it inside the source directory instead.
data_update /sw/data/blast_scripts README.md README-uniprot.md update_blastdb.sh update_blastdb-uniprot.sh uniprot.mk install_check_prepdb_blastdb.sh remove_old_blastdb.sh cron-wrapper.sh crontab.txt test/test_blastdb.sh test/prots.fa test/nucls.fa 'test/*.out' webpage.mk webpage.md webpage.html fixup
data_update /sw/data/ncbi_taxonomy ncbi_taxonomy-update-dbs.sh crontab.txt webpage.html webpage.md webpage.mk
# These databases are in /sw/data and DO NOT update via crontab.
# Update the repository copy of their READMEs, scripts and other files.
cd "$DATA_REPOSITORY"
data_update /sw/data/CTAT_RESOURCE_LIB CTAT_RESOURCE_LIB-db-README.md CTAT_RESOURCE_LIB-download-db.sh
# This might get moved to crontab later. The prebuilt databases are updated on
# a haphazard basis.
data_update /sw/data/KrakenUniq_data KrakenUniq_data-latest_install-README.md KrakenUniq_data-update-custom-db.sh KrakenUniq_data-update-custom-db_veryfat.sh KrakenUniq_data-update-standard-db.sh
# Additional resources for the KGP data, in particular: lftp scripts and MD5
# checksum files. The *install-README.md files are gathered already above.
data_update /sw/data/KGP 1000G_2504_high_coverage.lftp 1000G_2504_high_coverage.md5
data_update /sw/data/KGP 1000_genomes_project.lftp 1000_genomes_project.md5
data_update /sw/data/KGP central.md5
data_update /sw/data/KGP impute_haplotypes.md5
data_update /sw/data/KGP regional.md5
# Databases in other locations
DATA_OTHER_REPOSITORY="$REPOSITORY/data_other"
# Create the destination first so the cd below cannot abort the script when
# the directory does not exist yet.
mkdir -p "$DATA_OTHER_REPOSITORY"
# The snpEff custom-database install README is kept under the snpEff module
# tree (/sw/bioinfo/snpEff) rather than /sw/data. Fetch a copy of it.
cd "$DATA_OTHER_REPOSITORY"
data_update /sw/bioinfo/snpEff snpEff_custom_database_install-README.md