From c8f351aa0ae64ab43b075af6984032c87f42df2c Mon Sep 17 00:00:00 2001 From: Stephen Date: Sat, 7 Jul 2018 00:00:00 +0000 Subject: [PATCH] Added program documentation --- README.md | 70 ++++++++++++++++++++++++++++++++ demo/hiv1-genomes.zip | Bin 0 -> 12635 bytes demo/hiv1-lanl.yml | 53 ++++++++++++++++++++++++ {samples => demo}/settings.yml | 6 +-- kameris/__init__.py | 2 +- kameris/__main__.py | 3 +- kameris/schemas/file_urls.json | 32 +++++++++++---- kameris/subcommands/classify.py | 2 +- kameris/utils/download_utils.py | 9 +++- 9 files changed, 160 insertions(+), 17 deletions(-) create mode 100644 demo/hiv1-genomes.zip create mode 100644 demo/hiv1-lanl.yml rename {samples => demo}/settings.yml (88%) diff --git a/README.md b/README.md index b419e36..030ffed 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,76 @@

+## Installing + +There are three ways to install this software. Choose whichever one is best for your needs: + +**1. If you already have Python 2.7 or 3.4+ installed (recommended):** + +Run `pip install kameris`. + +**2. If you do not have Python installed or are unable to install software:** + +[Click here](https://github.com/stephensolis/kameris/releases/latest) and download the version corresponding to your operating system. +If you use Linux or macOS, you may need to run `chmod +x "path to downloaded program"`. + +**3. If you are a developer or want to build your own version of Kameris:** + +Clone this repository then run `make install`. + +## Quick demo + +This software is able to train sequence classification models and use them to make predictions. + +Before following these instructions, make sure you've installed the software. +If you followed option **1** above and the command `kameris` doesn't work for you, try using `python -m kameris` instead. +If you followed option **2** above and downloaded an executable, replace `kameris` in the instructions below with the name of the executable you downloaded. + +### Classifying sequences with an existing model + +First, let's classify some HIV-1 sequences. + +1. Start by downloading this zip file containing HIV-1 genomes, and extract it to a folder: https://raw.githubusercontent.com/stephensolis/kameris/master/demo/hiv1-genomes.zip. +2. Run `kameris classify hiv1-mlp "path to extracted files"` + +This will output the top subtype match for each sequence and write all results to a new file `results.json`. + +The `hiv1-mlp` model is able to give class probabilities and a ranked list of predictions, but some models are only able to report the top match. For example, try `kameris classify hiv1-linearsvm "path to extracted files"` + +To see other available models, go to https://github.com/stephensolis/kameris-experiments/tree/master/models. 
+ +### Training a new model + +Now, let's train our own HIV-1 sequence classification models. + +1. Create an empty folder and open a terminal in the folder. +2. Create folders `data` and `output`. +3. Run `kameris run-job https://raw.githubusercontent.com/stephensolis/kameris/master/demo/hiv1-lanl.yml https://raw.githubusercontent.com/stephensolis/kameris/master/demo/settings.yml` + +Depending on your computer's performance and internet speed, it may take 5-10 minutes to run. +This will automatically download the required datasets and train a simpler version of the [hiv1/lanl-whole experiment from kameris-experiments](https://github.com/stephensolis/kameris-experiments). +The full hiv1/lanl-whole experiment was the exact job used to train the models from the previous section, and these are the same models used in the paper ["An open-source k-mer based machine learning tool for fast and accurate subtyping of HIV-1 genomes"](https://www.biorxiv.org/content/early/2018/07/05/362780). + +Now, open `output/hiv1-lanl-whole`. You will notice folders were created for each value of `k`. Within each folder are several files: +- `fasta` contains the FASTA files extracted from the downloaded dataset used for model training and evaluation. +- `metadata.json` contains metadata on the FASTA files used to determine the class for each sequence. +- `cgrs.mm-repr` contains feature vectors for each sequence. See the mentioned paper for more technical details. +- `classification-kmers.json` contains evaluation results after using cross-validation on the dataset. See the mentioned paper for more technical details. +- The `.mm-model` files contain trained models which may be passed to `kameris classify` in order to classify new sequences. **Note** that models trained using Python 2 will not run under Python 3 and vice-versa. +- `log.txt` is a log file containing all the output printed during job execution. 
+- `rerun-experiment.yml` is a file which may be passed to `kameris run-job` in order to re-run the job and obtain exactly the files found in this directory. + +Kameris also includes functionality to summarize results in easy-to-read tables. Try it by running `kameris summarize output/hiv1-lanl-whole`. + +You can change the settings used to train the model: first download the files [hiv1-lanl.yml](https://raw.githubusercontent.com/stephensolis/kameris/master/demo/hiv1-lanl.yml) and [settings.yml](https://raw.githubusercontent.com/stephensolis/kameris/master/demo/settings.yml). +Training settings are found in `hiv1-lanl.yml` -- try changing the value of `k` or uncommenting different classifier types. +File storage and logging settings are found in `settings.yml`. +After making changes, run `kameris run-job hiv1-lanl.yml settings.yml` to train your model. + +[//]: # (## Documentation) + +## Dependencies + This project uses: - [stephensolis/kameris-backend](https://github.com/stephensolis/kameris-backend) to generate k-mer count vectors and distance matrices diff --git a/demo/hiv1-genomes.zip b/demo/hiv1-genomes.zip new file mode 100644 index 0000000000000000000000000000000000000000..0c7ab89bc89833ff78ddb1155a8d959c356305d3 GIT binary patch literal 12635 zcmZ{rb8zo&xAtpu*S2ljwsvi6*S2lj?XGRx{nobIT^sLyo^$4$-!o_4WF{-Q@8t7G zW|CajS}Te&pkSy#KtNDHMKKSuo_SIr#&AGDSL8rIkbg&o>CFtCoDFk+PR8nLAm^-T zJRp%0Vva#USCk8_5f3H~qh=s91YeB}Nxpi?SC8^*R6_*=lv@9^n+u#bTd7@rmi2x- zJ^Q^pZR~V;dVU@a?(p+>f8Xp)jw<;1eSLj)KF<>Dd_FwoL3GVcN>r(oN!0(Na;%XjW)^S~+~QIDV6mQ#i2K`*TCV#A@Zb*~7Z#5;Ngr=?Bwz z@XWSwf7H_nFdFDN-g+i+`er}=ql-e~*0CLd@@?$bPIFH~-=K~ATPq;uI%=f}8oGVz zQP&c+^spG|e0`v%6=eyb>uZE;s`)@|B6g?#ot@%Hbj9->=E!!;*=K1Jo;4nCC5q;B z#!JqLUx;L-DW?57Yn{RqQh`WB$(qQrhQI>z;>r(`7ndU>uhd{}%U71JPWUXO`B}Ps zf@=LB6JcF9fLukm@1YJ646nS}I3KZXXjkf&o?KK&^J8F+n}Op^vkCbVGLmVl$Rl}k zL+3EW4bC3`V!qag4#Slq?FNz0AkWjuTZ17wbwURnwVMS;+x&HqYgoS}5_jenn3Zit 
zE-@+E607RKiJ|K$XAl7Wcf6~wB=C5W_z*44t?#;2mRC(v$*y`#1rkC#cSRKqeI(Gaq*R{9B`4TVAR1$ok9ns!lIfzbtif-B_ju_Zm^qZi!HRvrmzQ z@}E*mI~vEfpU{LqXpYabb_C^25!d>(x)O#ofzJgWD2@aLISPaZ_Q5aBiia*Pc3}a_ zWMsiqAu5Tf!cba<1i)8cFjI9TmZ_>|J3Mayu*EV@|02#|im=F9c=Eyk;9 zj{?1$0jk67d|&01Avn3BHIxr}V&ujR{WV@_hVs=Rf{c5&{Hf_+M}zIvCL`i|5;bc$ zSuR=BYYc#N9>;Qk7~^62bAQ^KBoBZM&0FnrWmA=d%))>Bv;8JC%`9{eBqoeevI>Lz z#7>tQ6MQ?MoI;p801H;sVuL1^_2`#{*a7Z34|zloQ+6C2HAg~$8zgt00xxuy1VJ02 zX>z(gif_XqXR(}{K@w9D^3PO$JCKc5>Y>9t{}rh$KH8&PfFauq0u&sNYot#mr{vo& zdl$?#lMFP3*`XYn@4)RotEd95ZHvBkGT|K#7BB&Wg&@@i*|U;)<7a&AoAMN6W`_=e zz!H=+qh+XO^u{!x!AV^cB_XdqnMc-J?W4e5p{0H&pjTskHYHHdwI#)2pnuK1B!i@Hc; zG}2}@%t{2oz(gZrRx8mSrqu=1BqS8ECebM04Wi3z9X`PsIytBi_s(>imS~L-5WnV& z#N`}osLI@+a2FbzR)Po@ZIJ(z!wP}K#8Z%NZW&61+uYSAnRFcAfwEqhd@)ZxS;?@D z*3bTd*gSw2Q-RzZSGqFvup zgj!|oF;K+Jgj$l0t4n_BPbaRhd?UAHp&NBZUldfv+w_f^#za4@6NaoXM(_x|6TS>0 z!yp-S5SpJAFtOh+TxDZ&bQ9^dKo=j>k?zqzm@Ft6rjb#z&|VWGm&RZp#Z*G)CXT3v z%|eGm7SC=+Z&B=&=dUxP20v0ucq+wWJ`3Co5-wk+Wnx)mDGHu7D$dCjSCg28IvU~# zJfwMTAf!xi)<~2hA1~1+!^`u!pjQ>F1sJi{8@<)p8z~AUhS5?s2&ot6$Ea|lTYGTf z3h6!sL6tF0h*Y-&(PL3Vo~liORilcjB)y;k17`q26sgtwMAvLjgQyB^uO)Fkdu23{ z!f}*t3)yY7s;%3Z5Db)ap=>w!KOVOaJl)>hY{?qEoB(}{^bJ|=-NDW2p&E;ZWD%8mFyw8ERP2v5WxqmpW3Lg<`vN0=hKSf|UK_&3u&JVzq?P zAE9tN)8MCCrbKBbacijt{tSvcx=73mfx?0M4&PJ=BCU$EB<*fdG7t%NqpSh6eN@9T z;?2qp$h`afO#yGRMT}(6nCm!Q=`|p=;4qw`VvyS4oE`barpD4b#6<-XzYC|ALSh4v z%ut<`m2@peXFM-=!clp7yXzemu6EQp;VVGVc*C$l=hxU03ry3aRo2yFtog+{kPMMQDhA9%^h8(i0a}?5hT@ zQA}9WvkmHQHxdmx5xn?EW*)yzNA78IcvWxMhw|Os4^;FlY45b-(t&^X%26NLYK=Jm z@vX&;a_7GmzVoIgaNEVl&dgD=BsF6I@e4^GjNjW+s_d@OBjrk0C?sCOka`LTBb#x3 zOWx_Y_Ow|qXdo9;R>9MoxUYMiCb}>1oc&cuhQxU>%MIZdvbzLko1MQn$1dm3tTPkX z-)w>Rgk{J1%Mpe8gk4S0tl~jF@T!3DvLtB2Aj(d=gJ-_tgJ(kDM^G?8R&cODyF~7Q zT&_xcL;lx65z6jJB;9t&HN1@abWYi+iM8>fuNp^uQ{gpw1yKE%{YZJd4|clbU=uxU z!4!*AQ(KgPH(V+X>7rCHR|BS!&PK&LjT*8KERv<0f)Xp(kO&hMh`Kq*5{xq9t-=YM zN0QPlY6De38FVT$O{8qDe`Np;b@5~GZ^{+r@+3esV%M)J1^CwS?d3M8a#cjmt&viO 
zIADLMU&>&DaJ0It#iN>W%4KSqX-b{Wb0fx>Ztm_+i`}0%Vp;2`l)VK}O11|3GZsb{ zl^b%(Q_)ZY)wE)?B+lKZdxYU1h5|W*-GD#^=Ol3oJFA3?X(sK8RT>tlCO6tdnK3zv*MMgj6WnP{9GiLj&D@Qk1sC zP}-SxNl)Iano;ShIJF}Lf$x3b1_foO=ZI;UarRk9d<5AThdRf215&x~RkTPT@yaZ6 z86XwB5BW;c`8CkvrDqZv#>joET*midQtVwWM!?(UD#3L{Z>{i74s3ex5n>$r1z96( z0sZF2>vrQm`C@F@>ay6Me1f(*CSO@0>5`C`r+g{>ma;Q$ry)x*@cBcyo{Ni0Vzw?9 zw{az|fE2me_#8iUZFnwdTXR&gJj$F?O<&0v1!qWy?b7|g)C=6q>FbX746Ud6wx4fQM!Me&OYz&x!t8GrlM{T=d!x%NE;A^{Z z0WkqQkC(?DiEmd}DcE)>4<@JUm;pSv;68(u+TzB8Ea`j}uKj z1JQU*==ethN_Lu@uAZhOf)P)#(M!^JuJ+|$oYjn~~=lcl*s=H$jl( zG9db2XadapaAW%}axU6%n;WNJNiv`qo`z600sE#GrZUGNuGSC1U$QxzTIs3}j>>m5 z65y2m>O0-Ly8$;~eu3mnQBP>b)6CHWky@kS#I7L-xk&xpDi>{dwZxHYK_aNQBEqz} z+y(^BnQOwqiEpMl9uq)R;%+!~2UuP7vrU)rhD<#nJQKhPWjMzfE9i`ogp}4;po#PD z3SR?{O!s31#HHuKklZC;y;ANI{Zmhp1ucH^$JqJGz6#$Pm+QO^^&;1xqdbe!i&QPABTJI*^qpY(B2Gf zHRr${vIbMVir?PB@#okl0)o+2ZLH|uTLp5nkF6jmeQ-q>*ex&=Z4nUKq@Y>G)IbRr zkWVrW_SKn{87-0#1p2MK$1DsxFzp$%W{EwWlUet3&pgBW62sh7NK?r7Knx^qf{eb+ zS^xy`JeI7EnEWds48e}koG(LifVv8Q%HN0B>uJ{b#X+r$VV_F&V^PlwLhUD6T zB~X>ZSduJt*SjOQD;$rQjNDGI;W|c^NRud47=csyTmy;0l<3D1eRcjw+3dr?Jp|Ii z;KNMZ{qA(%k)q2xtKwf?E83xGA`zm^sByfZm)Y~BR`}F#Q%&_)Vl$jD(OZ54a~`)* zREn)g=jjLwv*{$N*-Tz6Yhpo%Qa3UTToV>m)lEG#@hB}4^Vm?h2$uWnwD z#;Y>%pHOZG==W457VCNF;(ZsI7xmFofvp4fSr1%$*ZMkDpdQoh5U|4bYPGgHv*pIY z?XPu*>$KLQ*GvKkne@`91Ybv#wp?E2thywD*EsLU*#Bt2UgvYtAWU2WEf{%swVeD& zPEkA-JGxtU&d3l-yKijKp|w%uO_D2 zg!og#6r)JUfK(8EP=7ub{2_sj9gfhpafbGj_?8Foi^<5QEL&VuNEgPM||`Ta^NUj zq#BRNuwWh3B+3$RcG_0LDzyMImJ<>hL7UBx>CgS8XfF=aD3TikJDfx)L~r6a<#_Ju zbL2TXNGQZJvA5>Aa)Twz^Ne+wX&(=`Q_Au0BDaJ@QqsV^GWX_#*;vd zJBPuDq$-h_58A zPYu!Iteu?CwGY7?+qseENj(KV#zw!gq#~F$N}7YOzf5Y?am0tEjGGLqmD%Z-Jr(bR zJ7L!KryQw_B#$TulBzNk)x-wmZ=%}tmfO|_Onovem!6X@`@U$D)IAqn8}gTmU(@oP zP&%9P9{ozZY^0<7TUrTDGVP|Vd5j`F8_ehe$~SZJmT8~dxz?T|pGs>FYeA)XTiJpb zlS)cAx0i6OaekT7>W!gok%kOPN9wi1W)z4HELo2(a~Ofyd+eSpCEsw{N!V~rhN)B$ z+TRZWM<4V1KUTLIo93IEbaYhQzL1eu0JaA#Sws|PC41mLYI0iw*W|XprexM{`ZEB> 
zI$L66wojRRzkjtJSdY-m%5B(+;+!?Kj*iu2tJ~f$MZuHxgh=C0O#O0?Vhb?EyFXd=?E~vA&R?qK-;v3uBBmn0cP`)7*d;N^6 znW@|fR@rVwjZ`cUh z+CE9wBYCjgq)SF-;e4ch5Ww9T~WLK=F z7y#FJaSdbF^qy;RPyx_`&qHi!RrH}HG%IHuoN)&DR~Ofn2-mg;0Vq<>0U6q5H~j$# zJxi|+x_fv>@MDcu$jdE)dYPwj6ALaaDuHcMM_b_|aCbhs@R1}^#2RT;P4~MU;W`wsm z6CppPE8w!@pkJA^-Kl+mSTmEV--W)05z4XC0~9X4g{*dEQ$T9VwyBnyux;GLc7B(d zwDw_RzCMkoUv|gaIH<_OWbQ3HT^U~#=J~13J+WHxxF6&;x`8(TkV`4OSiz+m7YnFW ziGqd3Mw96Q?QK0!w1<~8uK}RJrvbPtA*#KUs64Wg48YF38aA#H7BqnO-y62`F~D}R z{V9mNq4shBZHn5UBFY1?L@OB7{fus;6-%qI zb4{(mcj9+O#SkZz z6c_ch1#hGRTx(C)7p#DIeHgUP(lLAIF1N)}{saIrrNV98G|_6SVwx`+5de|{DqK~o z{-*8rtXsKR&$V|8RQMV}J-63OD7bZE)%N|Av_~7Z2D>U~5gxF8w)2cm@B5W}L~UF>|EN4fQW59<594tM_Xef#(v%@gQ;2k7^_Jf`V?U1i06 ze?E5leZ0Jex!vaF`F`CVJn#5?40rbWeeLb+_<7>Los{|ceC+`uYAS1fyMIldN1vX? zda(HAV%$Hys#TchrOoDDpIl^*QnQ@jQ0`cUo*a4O-9~%rsaiYCbQNeduxVIUdgJ@% z@)Iz>f3g_2E_;WbziN^H%#x84NSeOz#Lq%JsW;C+Yz&Gv=eVUCY+9NVsRLflM#@~s6sGHOz?mgM(Z=i^QpFna(-*lYaUyjx2yvvjiq^i9v`9B zT$0fdr4Wg%CPBeq`yJFXnJQ%_c&6*sr72>5HdQFF8Zqcv_nq+^gB(KN!g4Um4qR?#Z~)>#!PZYG{xzP%Q~$L) zqK9Rjn`nvSVCln{c2|ckI|b{Ui5B2b`$C%#)@ac>0It2_AE0=yonwKV-#x|D>~6Ie zgq7tueZ~cgVnvPRAy_y+ke!`>204AV*!5HOv2b)7?_sC0?o6xaP)}s99)8B|RZf4J z8oLIVWpL8!ZOE}vWfDVTTN57JW0jHc~4;GVR!b`2iQ9}*$`o{(`!230-au$7(|g=+OqZqR!DGy^q@PG~w0 zuD{0ajkpc7EUia2p9|q=bg^h-99FVMl0FTVNk&7Tix*c)1P%*0y1cHT$>rgB{vB2H zb0`<1rXJmY052}G$=?va=N8Kf+9@Gn?qGG}L_L@je_DxuagZb3Vt0GJCn@j;h%5ks zJ*&<$2QtB8s$JV+Zl!I9OQ?n#)a~Z&O7J--fB_E5IE;*+)P_+V_4D8xVaz;f1i$X8 zGpwp!106o1Bj%j2m zoJ9#BC`$6z#aS8+!be)$w84kyu&`q%gdVIHe3=0IzVU#PkxQ={I2N`uj+^9+NQcF} ztJH#>fZecN_vteS!#{VkX0zv<87&+iG}U;D!&&|dW`AhF4gEP8i~!4z0}fDxYVuk? 
zsFYc{*DJ9LINKbp>PgLWGGGr}(}e^N8*Hmp6Ik%2DN13h(?)s?Roedm$;4UG%M5Vv zCzIwSY50eyz+5P*lIzVnXi}f?>>%16MPdg1s?O4MX`G(Giv(V?jtU!V3|vJ?&@g6E z{cV}J(tKHUb1Em6bZs*Q2T(}}qD59L6wB?NqSl3abF-!45o!K)Y##7;k7TPD=3?ePaArBMus=I9=$_-JVgZ0Q- zs0@T=wPqU4H9WzQrhIUi7SudnkyagS{ACEw%eXy#akM~PLdhPH#Ow)xY?jgjHAp7k zFSkmj)%*%>EH|W$H^asZmkVDyrK2AH86$Mf<+R>>%4}ehK$dY*h-`e-lXCLVEAP;< z+0Yx^4U&TU`D12)TVK%uhL_anPh7*eqsxwF6XgoY*Q;F!S>Uqtg_(s;`FOAH6e1$( zdkOOn@h294t$as^2>%a#;od`3X|3(!tQp}Q)D_Dw z`?D+^6!WgVr|ag@4lExk#c<8V6)*;}ssm!TlgbFakdj=zqRF7XUxu|Ym~ zryGTH0AZB0OhQz$$fPV!aGOGv;?n7E7)F^+NfarbLJz(huUlbzDII2R$42(~IS;p^ ztHKg(-2YiN)(9~c|Obo9%V zwK}ftoXIXV(|7U%O-a{NUBr4#>${0{gqq+kTz?2E8+5b@NFV3WTwl3sWYF-LY4JYu z`}E8SwdQl(@e17pJiIRqXOYnLZsivO$dBZHx!_3V+2XbdONp&tMkowTX@oVmhV5HE zwpeWlr!AD_I#i#K$2#;R7gK%|?OKy*jH1!YVf3eRJp87uZ^02PBdwWmWY0`wqx!O) zeeE7XzMfHvH#pzsFA}FC8jdJg5ti~lY5S~dOYsTMY_%+G9341l^A)}^_(oDx=P8x3 z1eI#{a^gF)d*hI&62biL_QA=%T9V71WovLbK?UK@YL9H~-3~FBk*hw; z?NV!tGzEx@MZfJUha=K<#zw#$2f-{O=YgP*bjT?@3#NW6%}H7-mY-@l88I5Z6RaAqefbkt`*+k6IUyT` zLq@>G-0MDb(r(xe&7ETtof`VqL4YDC4xOz?5|8-TD;FA!AF8${{=0sZh|NM%r{(fBeN3Km+(y5vx$~&LZyz)J2y*vD>rC%#Iu3?;9z%3 z+2qt$b8uxI8+_1Fj4H=_peBX&ItBBOpbOV&uVwIC>xC-462$oPAZO4^(TQU+3xF6- zKwHJ2n{+Er2d&zI3i3-gmNtOd(LGl$+wrQ3+lI>};@hl7Y*pQ2jn)KZYC-YpnvyNB zPa^2Lccf*oPOMH(<+z8U?cEO>v>UeJ%+Z=4=y_Yz=Vo7+G|6wP;>I35TQ*U;*f~HP%FGsd^TSaRp02~`}nbrg$ z_-uDX16b|3VQjx~=Y_1_x|3CeD2?XWf5F~&y``S^Svk2UcqES)KV?WyxTk&;h?2@NlLnh~A!nVp8x1mi4K+bhIlbGLczn(WpzlN=)3p<iZu!RN8>D$4c>w-nn@??>?`7W2d;I55rW{I9 zx|y;L40+oRQ|%Uuc(RA=j`of(9h2*MwpNiO(bpxo_4-`EZFN#6_212sPI97AT5YRQ zi~pg)JK`T?W489__Wo+{x4-%He`|2||4yj?oQyObMc(||*aNu=VQ6v~LzENMgbcz0 z#@v`ayb2|rm~i<0Jnp_|%ew_|df#?BH#s6TJd{1zq0rH{(9E$ z{=B)E)#v|wdc7jp@w`le+`u->dcE8sQS*jK}$V`=}h(>!hM-3uNUi*s{| z-#+c2vGgHiujAv$QM5K;HrACVEXpa#YwzCR635p%N3hCMZYkv>oaR!-v;5*=8*#fU zbdsyHq&}oOb|b7N%uR?ZrM1ZRy8kB!or?SYSI0KZ7kC;bH$hL#mu?jP28T)cU<}-; zXefHg+@G(G7g~v2J-s-NovpGpv{~j3i?*%hnrHZ{cpU4@X8eUg#>nlPAHD?K23M9I 
zDdblRbJQ>%<1Wp>seFy^r4mnXEHm0$`RP5sW&#YU=fmj!+yIt^-^aCz@Nu57AuFDp zGU_UZ{EZsT+!||Fltc2awRvWbF>M%QA~7=bj1etj%;W>sX2?z(bx$u5$$~E3J%rk; z>dM2#TgE!!k2|wH1P|O`J>gzjaVf83x#px{$v6Rl@_@%I4zINy@xW>G%OmM(!#9vS zVwRFgs)}kxdvOm63Jl$nVx#qm@fov}h$NVCe47yKoX)i7j5rFYhv^*Zepon7TR?Ip z7s3^Wm#ft`~GSVqw&vSn$6GyUYv`xZBkZ@M| zCRJPIyr2*c+1<7LTEG9KcROxB*0z4lfg`V?FXmta(S%ka|9ko*N*%_;$%C#c;ke>W7E8&H5J{!rjVv1 zwU3Oij`1!pMfyJY(}Ar4NCso-gW;xb`ZaP9>f*VyxmSpMAwdQLO0q98cmV-&4%wmU z9@39SRcr>7>_$Zi-_)~Wv=L26lDI-44IALL4pd4sCe@FN{RKAfSn2|59MuA3t`$m> zg6;j3YmP=VyF>Tq0QMvAjb=OYP_b1K?*2&CylQz0l81Cw+nDFn$z~mBij|cjZ2Z0bEKUBn)evuejjGL{8VDOVh4$OZ|C8is!emG=m8 zQ^0gcG%wxN5yE!q$qPL^lsqDvJ}5T5SLw6O93>t1!@|aD)P3NZ!(n9V0jjNUeix%T zB<`PIP63!dL|RsGNeC9tr`9ZGp*q^ zMdhN+%r#{;rq_SeCWno+MWc+gn}|O|ck<}vgBN;v5F+$M5w!D*I#4&4HBmfo|I+4o!nLaSd7^ypMbrF%O^VK$6{1@#Bp#Dt6x{5EMyubZ) zV4gdDHHcy(!5mH51yo$RUff^kMw;w6h2URXPWrqJ`wW!~^KP!l6=HG)TC^o-7DEjq zA#O!+ukMml!>CUv(HDU!f5tFYL2Fv>-t^vvSf!IWTYfi2pW0DZiBzcFO_*?wJDzRw zzZ`GhpQMXakn0}EAJzDqm4LUfbSwJ|3~<-(#8;$P^&yEPTVap+^JPI)L)pnCI7mPk z>|rSu_YxB6ZDi8B?MIIfFvAqd>V&gWB>i^1c?62)NlAtplMgI+Abn(|Ce?{n_1?_| zMkF5S!OoD)EM}XwBu6^!S5G=uqliyP(+CrA%*NnW;6}m~r9_KSONo{6dd>ry(|b{E zb4hm;+zdrMoe}-rK7XG~!8pxNyLJUPLX^_pwxvTVWKgFvlT@--;NI=`yCPsv@%Zyj zlo5S>%C(VK_ z5!^0A#D=A@lOdc0QGyN_;T#%j>q>a==}(P!DXvJN^34ikJYA~6tV$^i-jmf{_BpR4 zBraC&2Dw@9>%y_1_0U|mP`VXB>qDk%3sht09HY#f}NAMi%Z4rfMK69H`bBtlb(GuXDn6DL-d6y!qA$g-xUQXD^2_hd64K_ zc%;T7doR~mQ$~p}8ddF{=&+ZMMjSk<)Sa3*K*F$j^1;TDiqpVUE>;!={|@Xd(@gxn zC%Do(Dn1ISr}Vr4gU<(x`aE3Sm>60@w+j{@Gk?%Y+t|Q5+gXpnYQ-k0T$D^s^!<%uZrS z*rtH+-{mJYQcfes_Bz)d0E9W-U4$Wz{(Q0cVUU6Q2rm0$Pu_uOWC+nv*+svs9U#L^ zUR-tnJLdgHBG^Kxm^LKP8)KM1DAf)(m4w#5a;qR}NCfl)HOH<^Vy7tefIB=mQU)h| zD3NR@M~kb`!u02X;*^awe<{|vdB3VP{urfz3z~vvy+tfO@Kz{V5zHeJl3!a-Ldo_1 zxU}tlI`mDfGYz0kt{%!gX*nCca7)iTd&V+ioaiTIo-~hGJLMW{YS0fPJs^w5Ij-csBDM+ss*U@SvoO zCQcrviJ%)IS<#$hmOKRALlc6|?~>TCZB#Zy1Z5fx3qMCLEb56@h?{>=2j ztbj9{a*?~+Gm9B9qn%ZX0$gm2uF>o%q1F>$%`c)2Hgq+~;>v%$P%K{_WH>%GLE|`2 
z^x-0e%|$4N*<*KV+hFoqsZY)ov_o|%i163A(GgHL%vokwhgQiRQiirxGpF-x9TGdO z9dB?sq}VZ|6R4?hEl!jbgkd|{8HSg{9bqB*5)jvm{~W&^j}nND%W{(UTrcw>&tMP> z9RE~sw4MfHyw`}qOhOAJS*T@ts5G3w7oe6HnI~AXZ_;(|G-qagCuPYQmyQhi?J6^? z#cta!XHT>hxmZ%5WXYpDvWc4t(Ietx{>5yFwtHArtnNWg2DZr6cYRo!4ok0vJn`lY zld=hVUz8zSp!HpwfC+BqeJM5$>e#WXmA&MQ>x}7cX4NgVNd^CPEgL(yR>H6tU^TIC&3NR0_Tl%v2auq zPie*2$Ua2>#r{m&hh-J=9oas{>gO84Ywsv+w;tt~yv{PLF{P8|G+ zpP~#f2m#Rlufd=6Z#w>83*o=F|I_Xd^WOu&K$t*tOXZip2p54%!c+ev{uBIvJ9hpF z4*&ZHUZXwv;)eIO2XoulnNB<9k|8;r* literal 0 HcmV?d00001 diff --git a/demo/hiv1-lanl.yml b/demo/hiv1-lanl.yml new file mode 100644 index 0000000..aaaf423 --- /dev/null +++ b/demo/hiv1-lanl.yml @@ -0,0 +1,53 @@ +name: hiv1-lanl-whole + +experiments: + subtype: + expand_options: + k: 5..6 + min_group_pts: 18 + include_recombinants: true + dataset: + archive: hiv1 + archive_folder: lanl-whole + metadata: hiv1-lanl-whole + selection_key: subtype + groups: | + lambda options, metadata: + import collections + counts = collections.Counter(x[options['selection_key']] for x in metadata) + return {v: {'selection_key': options['selection_key'], 'values': [v]} for v in counts if v and counts[v] >= options['min_group_pts']} + +steps: + - type: select + copy_for_options: [k] + pick_group: | + lambda metadata, group_options, options: + return [x for x in metadata if (options['include_recombinants'] or not x['recombinant']) and + x[group_options['selection_key']] in group_options['values']] + + - type: kmers + output_file: cgrs.mm-repr + mode: frequencies + k: from_options + bits_per_element: 16 + + - type: classify + features_file: cgrs.mm-repr + output_file: classification-kmers.json + validation_count: 10 + classifiers: + #- 10-nearest-neighbors + #- nearest-centroid-mean + #- nearest-centroid-median + #- logistic-regression + #- sgd + - linear-svm + #- quadratic-svm + #- cubic-svm + #- decision-tree + #- random-forest + #- adaboost + #- gaussian-naive-bayes + #- lda + #- qda + - multilayer-perceptron diff --git 
a/samples/settings.yml b/demo/settings.yml similarity index 88% rename from samples/settings.yml rename to demo/settings.yml index 91d932b..0b89704 100644 --- a/samples/settings.yml +++ b/demo/settings.yml @@ -2,11 +2,11 @@ # this is required local_dirs: # the directory containing zipped datasets - archives: /data/archives + archives: data # the directory containing JSON metadata files - metadata: /data/metadata + metadata: data # the directory for storage of job output - output: /data/output + output: output # if desired, specifies an external service to use for logging # this is optional diff --git a/kameris/__init__.py b/kameris/__init__.py index 33d6f14..60df6c4 100644 --- a/kameris/__init__.py +++ b/kameris/__init__.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '0.6.dev1' +__version__ = '1.0.0' diff --git a/kameris/__main__.py b/kameris/__main__.py index 4042fcb..ff62356 100644 --- a/kameris/__main__.py +++ b/kameris/__main__.py @@ -44,7 +44,8 @@ def main(): except Exception as e: log = logging.getLogger('kameris') message = 'an unexpected error occurred: {}: {}'.format( - type(e).__name__, e.message or str(e) + type(e).__name__, + (e.message if hasattr(e, 'message') else '') or str(e) ) if log.handlers: log.error(message) diff --git a/kameris/schemas/file_urls.json b/kameris/schemas/file_urls.json index 05ae68e..c4ff512 100644 --- a/kameris/schemas/file_urls.json +++ b/kameris/schemas/file_urls.json @@ -2,19 +2,33 @@ "$schema": "http://json-schema.org/draft-04/schema#", "type": "object", "properties": { - "metadata": {"$ref": "#/definitions/url_list"}, - "archives": {"$ref": "#/definitions/url_list"}, - "models": {"$ref": "#/definitions/url_list"} + "metadata": { + "type": "object", + "additionalProperties": {"$ref": "#/definitions/url"} + }, + "archives": { + "type": "object", + "additionalProperties": {"$ref": "#/definitions/url"} + }, + "models": { + "type": "object", + "additionalProperties": { + "type": "object", + 
"properties": { + "python2": {"$ref": "#/definitions/url"}, + "python3": {"$ref": "#/definitions/url"} + }, + "additionalProperties": false, + "required": ["python2", "python3"] + } + } }, "additionalProperties": false, "definitions": { - "url_list": { - "type": "object", - "additionalProperties": { - "type": "string", - "pattern": "http(s)?://.*" - } + "url": { + "type": "string", + "pattern": "http(s)?://.*" } } } diff --git a/kameris/subcommands/classify.py b/kameris/subcommands/classify.py index 72c2b63..6e08a7e 100644 --- a/kameris/subcommands/classify.py +++ b/kameris/subcommands/classify.py @@ -25,7 +25,7 @@ def run(args): model_url = args.model else: model_url = download_utils.url_for_file(args.model + '.mm-model', - args.urls_file, 'model') + args.urls_file, 'models') model_file = download_utils.open_url_cached(model_url, 'rb', args.force_download) diff --git a/kameris/utils/download_utils.py b/kameris/utils/download_utils.py index 2d0a41f..9fcf909 100644 --- a/kameris/utils/download_utils.py +++ b/kameris/utils/download_utils.py @@ -8,6 +8,7 @@ import requests from ruamel.yaml import YAML from six.moves import urllib +import sys from tqdm import tqdm from . 
import defaults, fs_utils, job_utils @@ -53,7 +54,11 @@ def url_for_file(path, urls_file, filetype): # NOQA (cache line above) )) filename = os.path.splitext(os.path.basename(path))[0] - return urls[filetype][filename] + if filetype == 'models': + python_ver = 'python{}'.format(sys.version_info.major) + return urls[filetype][filename][python_ver] + else: + return urls[filetype][filename] def open_url_cached(url, mode, force_download=False): @@ -63,7 +68,7 @@ def open_url_cached(url, mode, force_download=False): 'cache') fs_utils.mkdir_p(cache_dir) - cache_key = hashlib.md5(url).hexdigest() + cache_key = hashlib.md5(url.encode('utf-8')).hexdigest() cache_filename = os.path.join(cache_dir, cache_key) if not force_download and os.path.exists(cache_filename): log.info("file '%s' already downloaded, using cached version", url)