From c8f351aa0ae64ab43b075af6984032c87f42df2c Mon Sep 17 00:00:00 2001
From: Stephen
Date: Sat, 7 Jul 2018 00:00:00 +0000
Subject: [PATCH] Added program documentation
---
README.md | 70 ++++++++++++++++++++++++++++++++
demo/hiv1-genomes.zip | Bin 0 -> 12635 bytes
demo/hiv1-lanl.yml | 53 ++++++++++++++++++++++++
{samples => demo}/settings.yml | 6 +--
kameris/__init__.py | 2 +-
kameris/__main__.py | 3 +-
kameris/schemas/file_urls.json | 32 +++++++++++----
kameris/subcommands/classify.py | 2 +-
kameris/utils/download_utils.py | 9 +++-
9 files changed, 160 insertions(+), 17 deletions(-)
create mode 100644 demo/hiv1-genomes.zip
create mode 100644 demo/hiv1-lanl.yml
rename {samples => demo}/settings.yml (88%)
diff --git a/README.md b/README.md
index b419e36..030ffed 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,76 @@
+## Installing
+
+There are three ways to install this software. Choose whichever one is best for your needs:
+
+**1. If you already have Python 2.7 or 3.4+ installed (recommended):**
+
+Run `pip install kameris`.
+
+**2. If you do not have Python installed or are unable to install software:**
+
+[Click here](https://github.com/stephensolis/kameris/releases/latest) and download the version corresponding to your operating system.
+If you use Linux or macOS, you may need to run `chmod +x "path to downloaded program"`.
+
+**3. If you are a developer or want to build your own version of Kameris:**
+
+Clone this repository then run `make install`.
+
+## Quick demo
+
+This software is able to train sequence classification models and use them to make predictions.
+
+Before following these instructions, make sure you've installed the software.
+If you followed option **1** above and the command `kameris` doesn't work for you, try using `python -m kameris` instead.
+If you followed option **2** above and downloaded an executable, replace `kameris` in the instructions below with the name of the executable you downloaded.
+
+### Classifying sequences with an existing model
+
+First, let's classify some HIV-1 sequences.
+
+1. Start by downloading [this zip file containing HIV-1 genomes](https://raw.githubusercontent.com/stephensolis/kameris/master/demo/hiv1-genomes.zip) and extracting it to a folder.
+2. Run `kameris classify hiv1-mlp "path to extracted files"`
+
+This will output the top subtype match for each sequence and write all results to a new file `results.json`.
+
+The `hiv1-mlp` model is able to give class probabilities and a ranked list of predictions, but some models are only able to report the top match. For example, try `kameris classify hiv1-linearsvm "path to extracted files"`.
+
+To see other available models, go to https://github.com/stephensolis/kameris-experiments/tree/master/models.
+
+### Training a new model
+
+Now, let's train our own HIV-1 sequence classification models.
+
+1. Create an empty folder and open a terminal in the folder.
+2. Create folders `data` and `output`.
+3. Run `kameris run-job https://raw.githubusercontent.com/stephensolis/kameris/master/demo/hiv1-lanl.yml https://raw.githubusercontent.com/stephensolis/kameris/master/demo/settings.yml`
+
+Depending on your computer's performance and internet speed, this command may take 5-10 minutes to run.
+This will automatically download the required datasets and train a simpler version of the [hiv1/lanl-whole experiment from kameris-experiments](https://github.com/stephensolis/kameris-experiments).
+This was the exact job used to train the models from the previous section, and these are the same models used in the paper ["An open-source k-mer based machine learning tool for fast and accurate subtyping of HIV-1 genomes"](https://www.biorxiv.org/content/early/2018/07/05/362780).
+
+Now, open `output/hiv1-lanl-whole`. You will notice folders were created for each value of `k`. Within each folder are several files:
+- `fasta` contains the FASTA files extracted from the downloaded dataset used for model training and evaluation.
+- `metadata.json` contains metadata on the FASTA files used to determine the class for each sequence.
+- `cgrs.mm-repr` contains feature vectors for each sequence. See the mentioned paper for more technical details.
+- `classification-kmers.json` contains evaluation results after using cross-validation on the dataset. See the mentioned paper for more technical details.
+- The `.mm-model` files contain trained models which may be passed to `kameris classify` in order to classify new sequences. **Note** that models trained using Python 2 will not run under Python 3 and vice versa.
+- `log.txt` is a log file containing all the output printed during job execution.
+- `rerun-experiment.yml` is a file which may be passed to `kameris run-job` in order to re-run the job and obtain exactly the files found in this directory.
+
+Kameris also includes functionality to summarize results in easy-to-read tables. Try it by running `kameris summarize output/hiv1-lanl-whole`.
+
+You can change the settings used to train the model: first download the files [hiv1-lanl.yml](https://raw.githubusercontent.com/stephensolis/kameris/master/demo/hiv1-lanl.yml) and [settings.yml](https://raw.githubusercontent.com/stephensolis/kameris/master/demo/settings.yml).
+Training settings are found in `hiv1-lanl.yml` -- try changing the value of `k` or uncommenting different classifier types.
+File storage and logging settings are found in `settings.yml`.
+After making changes, run `kameris run-job hiv1-lanl.yml settings.yml` to train your model.
+
+[//]: # (## Documentation)
+
+## Dependencies
+
This project uses:
- [stephensolis/kameris-backend](https://github.com/stephensolis/kameris-backend) to generate k-mer count vectors and distance matrices
diff --git a/demo/hiv1-genomes.zip b/demo/hiv1-genomes.zip
new file mode 100644
index 0000000000000000000000000000000000000000..0c7ab89bc89833ff78ddb1155a8d959c356305d3
GIT binary patch
literal 12635
zcmZ{rb8zo&xAtpu*S2ljwsvi6*S2lj?XGRx{nobIT^sLyo^$4$-!o_4WF{-Q@8t7G
zW|CajS}Te&pkSy#KtNDHMKKSuo_SIr#&AGDSL8rIkbg&o>CFtCoDFk+PR8nLAm^-T
zJRp%0Vva#USCk8_5f3H~qh=s91YeB}Nxpi?SC8^*R6_*=lv@9^n+u#bTd7@rmi2x-
zJ^Q^pZR~V;dVU@a?(p+>f8Xp)jw<;1eSLj)KF<>Dd_FwoL3GVcN>r(oN!0(Na;%XjW)^S~+~QIDV6mQ#i2K`*TCV#A@Zb*~7Z#5;Ngr=?Bwz
z@XWSwf7H_nFdFDN-g+i+`er}=ql-e~*0CLd@@?$bPIFH~-=K~ATPq;uI%=f}8oGVz
zQP&c+^spG|e0`v%6=eyb>uZE;s`)@|B6g?#ot@%Hbj9->=E!!;*=K1Jo;4nCC5q;B
z#!JqLUx;L-DW?57Yn{RqQh`WB$(qQrhQI>z;>r(`7ndU>uhd{}%U71JPWUXO`B}Ps
zf@=LB6JcF9fLukm@1YJ646nS}I3KZXXjkf&o?KK&^J8F+n}Op^vkCbVGLmVl$Rl}k
zL+3EW4bC3`V!qag4#Slq?FNz0AkWjuTZ17wbwURnwVMS;+x&HqYgoS}5_jenn3Zit
zE-@+E607RKiJ|K$XAl7Wcf6~wB=C5W_z*44t?#;2mRC(v$*y`#1rkC#cSRKqeI(Gaq*R{9B`4TVAR1$ok9ns!lIfzbtif-B_ju_Zm^qZi!HRvrmzQ
z@}E*mI~vEfpU{LqXpYabb_C^25!d>(x)O#ofzJgWD2@aLISPaZ_Q5aBiia*Pc3}a_
zWMsiqAu5Tf!cba<1i)8cFjI9TmZ_>|J3Mayu*EV@|02#|im=F9c=Eyk;9
zj{?1$0jk67d|&01Avn3BHIxr}V&ujR{WV@_hVs=Rf{c5&{Hf_+M}zIvCL`i|5;bc$
zSuR=BYYc#N9>;Qk7~^62bAQ^KBoBZM&0FnrWmA=d%))>Bv;8JC%`9{eBqoeevI>Lz
z#7>tQ6MQ?MoI;p801H;sVuL1^_2`#{*a7Z34|zloQ+6C2HAg~$8zgt00xxuy1VJ02
zX>z(gif_XqXR(}{K@w9D^3PO$JCKc5>Y>9t{}rh$KH8&PfFauq0u&sNYot#mr{vo&
zdl$?#lMFP3*`XYn@4)RotEd95ZHvBkGT|K#7BB&Wg&@@i*|U;)<7a&AoAMN6W`_=e
zz!H=+qh+XO^u{!x!AV^cB_XdqnMc-J?W4e5p{0H&pjTskHYHHdwI#)2pnuK1B!i@Hc;
zG}2}@%t{2oz(gZrRx8mSrqu=1BqS8ECebM04Wi3z9X`PsIytBi_s(>imS~L-5WnV&
z#N`}osLI@+a2FbzR)Po@ZIJ(z!wP}K#8Z%NZW&61+uYSAnRFcAfwEqhd@)ZxS;?@D
z*3bTd*gSw2Q-RzZSGqFvup
zgj!|oF;K+Jgj$l0t4n_BPbaRhd?UAHp&NBZUldfv+w_f^#za4@6NaoXM(_x|6TS>0
z!yp-S5SpJAFtOh+TxDZ&bQ9^dKo=j>k?zqzm@Ft6rjb#z&|VWGm&RZp#Z*G)CXT3v
z%|eGm7SC=+Z&B=&=dUxP20v0ucq+wWJ`3Co5-wk+Wnx)mDGHu7D$dCjSCg28IvU~#
zJfwMTAf!xi)<~2hA1~1+!^`u!pjQ>F1sJi{8@<)p8z~AUhS5?s2&ot6$Ea|lTYGTf
z3h6!sL6tF0h*Y-&(PL3Vo~liORilcjB)y;k17`q26sgtwMAvLjgQyB^uO)Fkdu23{
z!f}*t3)yY7s;%3Z5Db)ap=>w!KOVOaJl)>hY{?qEoB(}{^bJ|=-NDW2p&E;ZWD%8mFyw8ERP2v5WxqmpW3Lg<`vN0=hKSf|UK_&3u&JVzq?P
zAE9tN)8MCCrbKBbacijt{tSvcx=73mfx?0M4&PJ=BCU$EB<*fdG7t%NqpSh6eN@9T
z;?2qp$h`afO#yGRMT}(6nCm!Q=`|p=;4qw`VvyS4oE`barpD4b#6<-XzYC|ALSh4v
z%ut<`m2@peXFM-=!clp7yXzemu6EQp;VVGVc*C$l=hxU03ry3aRo2yFtog+{kPMMQDhA9%^h8(i0a}?5hT@
zQA}9WvkmHQHxdmx5xn?EW*)yzNA78IcvWxMhw|Os4^;FlY45b-(t&^X%26NLYK=Jm
z@vX&;a_7GmzVoIgaNEVl&dgD=BsF6I@e4^GjNjW+s_d@OBjrk0C?sCOka`LTBb#x3
zOWx_Y_Ow|qXdo9;R>9MoxUYMiCb}>1oc&cuhQxU>%MIZdvbzLko1MQn$1dm3tTPkX
z-)w>Rgk{J1%Mpe8gk4S0tl~jF@T!3DvLtB2Aj(d=gJ-_tgJ(kDM^G?8R&cODyF~7Q
zT&_xcL;lx65z6jJB;9t&HN1@abWYi+iM8>fuNp^uQ{gpw1yKE%{YZJd4|clbU=uxU
z!4!*AQ(KgPH(V+X>7rCHR|BS!&PK&LjT*8KERv<0f)Xp(kO&hMh`Kq*5{xq9t-=YM
zN0QPlY6De38FVT$O{8qDe`Np;b@5~GZ^{+r@+3esV%M)J1^CwS?d3M8a#cjmt&viO
zIADLMU&>&DaJ0It#iN>W%4KSqX-b{Wb0fx>Ztm_+i`}0%Vp;2`l)VK}O11|3GZsb{
zl^b%(Q_)ZY)wE)?B+lKZdxYU1h5|W*-GD#^=Ol3oJFA3?X(sK8RT>tlCO6tdnK3zv*MMgj6WnP{9GiLj&D@Qk1sC
zP}-SxNl)Iano;ShIJF}Lf$x3b1_foO=ZI;UarRk9d<5AThdRf215&x~RkTPT@yaZ6
z86XwB5BW;c`8CkvrDqZv#>joET*midQtVwWM!?(UD#3L{Z>{i74s3ex5n>$r1z96(
z0sZF2>vrQm`C@F@>ay6Me1f(*CSO@0>5`C`r+g{>ma;Q$ry)x*@cBcyo{Ni0Vzw?9
zw{az|fE2me_#8iUZFnwdTXR&gJj$F?O<&0v1!qWy?b7|g)C=6q>FbX746Ud6wx4fQM!Me&OYz&x!t8GrlM{T=d!x%NE;A^{Z
z0WkqQkC(?DiEmd}DcE)>4<@JUm;pSv;68(u+TzB8Ea`j}uKj
z1JQU*==ethN_Lu@uAZhOf)P)#(M!^JuJ+|$oYjn~~=lcl*s=H$jl(
zG9db2XadapaAW%}axU6%n;WNJNiv`qo`z600sE#GrZUGNuGSC1U$QxzTIs3}j>>m5
z65y2m>O0-Ly8$;~eu3mnQBP>b)6CHWky@kS#I7L-xk&xpDi>{dwZxHYK_aNQBEqz}
z+y(^BnQOwqiEpMl9uq)R;%+!~2UuP7vrU)rhD<#nJQKhPWjMzfE9i`ogp}4;po#PD
z3SR?{O!s31#HHuKklZC;y;ANI{Zmhp1ucH^$JqJGz6#$Pm+QO^^&;1xqdbe!i&QPABTJI*^qpY(B2Gf
zHRr${vIbMVir?PB@#okl0)o+2ZLH|uTLp5nkF6jmeQ-q>*ex&=Z4nUKq@Y>G)IbRr
zkWVrW_SKn{87-0#1p2MK$1DsxFzp$%W{EwWlUet3&pgBW62sh7NK?r7Knx^qf{eb+
zS^xy`JeI7EnEWds48e}koG(LifVv8Q%HN0B>uJ{b#X+r$VV_F&V^PlwLhUD6T
zB~X>ZSduJt*SjOQD;$rQjNDGI;W|c^NRud47=csyTmy;0l<3D1eRcjw+3dr?Jp|Ii
z;KNMZ{qA(%k)q2xtKwf?E83xGA`zm^sByfZm)Y~BR`}F#Q%&_)Vl$jD(OZ54a~`)*
zREn)g=jjLwv*{$N*-Tz6Yhpo%Qa3UTToV>m)lEG#@hB}4^Vm?h2$uWnwD
z#;Y>%pHOZG==W457VCNF;(ZsI7xmFofvp4fSr1%$*ZMkDpdQoh5U|4bYPGgHv*pIY
z?XPu*>$KLQ*GvKkne@`91Ybv#wp?E2thywD*EsLU*#Bt2UgvYtAWU2WEf{%swVeD&
zPEkA-JGxtU&d3l-yKijKp|w%uO_D2
zg!og#6r)JUfK(8EP=7ub{2_sj9gfhpafbGj_?8Foi^<5QEL&VuNEgPM||`Ta^NUj
zq#BRNuwWh3B+3$RcG_0LDzyMImJ<>hL7UBx>CgS8XfF=aD3TikJDfx)L~r6a<#_Ju
zbL2TXNGQZJvA5>Aa)Twz^Ne+wX&(=`Q_Au0BDaJ@QqsV^GWX_#*;vd
zJBPuDq$-h_58A
zPYu!Iteu?CwGY7?+qseENj(KV#zw!gq#~F$N}7YOzf5Y?am0tEjGGLqmD%Z-Jr(bR
zJ7L!KryQw_B#$TulBzNk)x-wmZ=%}tmfO|_Onovem!6X@`@U$D)IAqn8}gTmU(@oP
zP&%9P9{ozZY^0<7TUrTDGVP|Vd5j`F8_ehe$~SZJmT8~dxz?T|pGs>FYeA)XTiJpb
zlS)cAx0i6OaekT7>W!gok%kOPN9wi1W)z4HELo2(a~Ofyd+eSpCEsw{N!V~rhN)B$
z+TRZWM<4V1KUTLIo93IEbaYhQzL1eu0JaA#Sws|PC41mLYI0iw*W|XprexM{`ZEB>
zI$L66wojRRzkjtJSdY-m%5B(+;+!?Kj*iu2tJ~f$MZuHxgh=C0O#O0?Vhb?EyFXd=?E~vA&R?qK-;v3uBBmn0cP`)7*d;N^6
znW@|fR@rVwjZ`cUh
z+CE9wBYCjgq)SF-;e4ch5Ww9T~WLK=F
z7y#FJaSdbF^qy;RPyx_`&qHi!RrH}HG%IHuoN)&DR~Ofn2-mg;0Vq<>0U6q5H~j$#
zJxi|+x_fv>@MDcu$jdE)dYPwj6ALaaDuHcMM_b_|aCbhs@R1}^#2RT;P4~MU;W`wsm
z6CppPE8w!@pkJA^-Kl+mSTmEV--W)05z4XC0~9X4g{*dEQ$T9VwyBnyux;GLc7B(d
zwDw_RzCMkoUv|gaIH<_OWbQ3HT^U~#=J~13J+WHxxF6&;x`8(TkV`4OSiz+m7YnFW
ziGqd3Mw96Q?QK0!w1<~8uK}RJrvbPtA*#KUs64Wg48YF38aA#H7BqnO-y62`F~D}R
z{V9mNq4shBZHn5UBFY1?L@OB7{fus;6-%qI
zb4{(mcj9+O#SkZz
z6c_ch1#hGRTx(C)7p#DIeHgUP(lLAIF1N)}{saIrrNV98G|_6SVwx`+5de|{DqK~o
z{-*8rtXsKR&$V|8RQMV}J-63OD7bZE)%N|Av_~7Z2D>U~5gxF8w)2cm@B5W}L~UF>|EN4fQW59<594tM_Xef#(v%@gQ;2k7^_Jf`V?U1i06
ze?E5leZ0Jex!vaF`F`CVJn#5?40rbWeeLb+_<7>Los{|ceC+`uYAS1fyMIldN1vX?
zda(HAV%$Hys#TchrOoDDpIl^*QnQ@jQ0`cUo*a4O-9~%rsaiYCbQNeduxVIUdgJ@%
z@)Iz>f3g_2E_;WbziN^H%#x84NSeOz#Lq%JsW;C+Yz&Gv=eVUCY+9NVsRLflM#@~s6sGHOz?mgM(Z=i^QpFna(-*lYaUyjx2yvvjiq^i9v`9B
zT$0fdr4Wg%CPBeq`yJFXnJQ%_c&6*sr72>5HdQFF8Zqcv_nq+^gB(KN!g4Um4qR?#Z~)>#!PZYG{xzP%Q~$L)
zqK9Rjn`nvSVCln{c2|ckI|b{Ui5B2b`$C%#)@ac>0It2_AE0=yonwKV-#x|D>~6Ie
zgq7tueZ~cgVnvPRAy_y+ke!`>204AV*!5HOv2b)7?_sC0?o6xaP)}s99)8B|RZf4J
z8oLIVWpL8!ZOE}vWfDVTTN57JW0jHc~4;GVR!b`2iQ9}*$`o{(`!230-au$7(|g=+OqZqR!DGy^q@PG~w0
zuD{0ajkpc7EUia2p9|q=bg^h-99FVMl0FTVNk&7Tix*c)1P%*0y1cHT$>rgB{vB2H
zb0`<1rXJmY052}G$=?va=N8Kf+9@Gn?qGG}L_L@je_DxuagZb3Vt0GJCn@j;h%5ks
zJ*&<$2QtB8s$JV+Zl!I9OQ?n#)a~Z&O7J--fB_E5IE;*+)P_+V_4D8xVaz;f1i$X8
zGpwp!106o1Bj%j2m
zoJ9#BC`$6z#aS8+!be)$w84kyu&`q%gdVIHe3=0IzVU#PkxQ={I2N`uj+^9+NQcF}
ztJH#>fZecN_vteS!#{VkX0zv<87&+iG}U;D!&&|dW`AhF4gEP8i~!4z0}fDxYVuk?
zsFYc{*DJ9LINKbp>PgLWGGGr}(}e^N8*Hmp6Ik%2DN13h(?)s?Roedm$;4UG%M5Vv
zCzIwSY50eyz+5P*lIzVnXi}f?>>%16MPdg1s?O4MX`G(Giv(V?jtU!V3|vJ?&@g6E
z{cV}J(tKHUb1Em6bZs*Q2T(}}qD59L6wB?NqSl3abF-!45o!K)Y##7;k7TPD=3?ePaArBMus=I9=$_-JVgZ0Q-
zs0@T=wPqU4H9WzQrhIUi7SudnkyagS{ACEw%eXy#akM~PLdhPH#Ow)xY?jgjHAp7k
zFSkmj)%*%>EH|W$H^asZmkVDyrK2AH86$Mf<+R>>%4}ehK$dY*h-`e-lXCLVEAP;<
z+0Yx^4U&TU`D12)TVK%uhL_anPh7*eqsxwF6XgoY*Q;F!S>Uqtg_(s;`FOAH6e1$(
zdkOOn@h294t$as^2>%a#;od`3X|3(!tQp}Q)D_Dw
z`?D+^6!WgVr|ag@4lExk#c<8V6)*;}ssm!TlgbFakdj=zqRF7XUxu|Ym~
zryGTH0AZB0OhQz$$fPV!aGOGv;?n7E7)F^+NfarbLJz(huUlbzDII2R$42(~IS;p^
ztHKg(-2YiN)(9~c|Obo9%V
zwK}ftoXIXV(|7U%O-a{NUBr4#>${0{gqq+kTz?2E8+5b@NFV3WTwl3sWYF-LY4JYu
z`}E8SwdQl(@e17pJiIRqXOYnLZsivO$dBZHx!_3V+2XbdONp&tMkowTX@oVmhV5HE
zwpeWlr!AD_I#i#K$2#;R7gK%|?OKy*jH1!YVf3eRJp87uZ^02PBdwWmWY0`wqx!O)
zeeE7XzMfHvH#pzsFA}FC8jdJg5ti~lY5S~dOYsTMY_%+G9341l^A)}^_(oDx=P8x3
z1eI#{a^gF)d*hI&62biL_QA=%T9V71WovLbK?UK@YL9H~-3~FBk*hw;
z?NV!tGzEx@MZfJUha=K<#zw#$2f-{O=YgP*bjT?@3#NW6%}H7-mY-@l88I5Z6RaAqefbkt`*+k6IUyT`
zLq@>G-0MDb(r(xe&7ETtof`VqL4YDC4xOz?5|8-TD;FA!AF8${{=0sZh|NM%r{(fBeN3Km+(y5vx$~&LZyz)J2y*vD>rC%#Iu3?;9z%3
z+2qt$b8uxI8+_1Fj4H=_peBX&ItBBOpbOV&uVwIC>xC-462$oPAZO4^(TQU+3xF6-
zKwHJ2n{+Er2d&zI3i3-gmNtOd(LGl$+wrQ3+lI>};@hl7Y*pQ2jn)KZYC-YpnvyNB
zPa^2Lccf*oPOMH(<+z8U?cEO>v>UeJ%+Z=4=y_Yz=Vo7+G|6wP;>I35TQ*U;*f~HP%FGsd^TSaRp02~`}nbrg$
z_-uDX16b|3VQjx~=Y_1_x|3CeD2?XWf5F~&y``S^Svk2UcqES)KV?WyxTk&;h?2@NlLnh~A!nVp8x1mi4K+bhIlbGLczn(WpzlN=)3p<iZu!RN8>D$4c>w-nn@??>?`7W2d;I55rW{I9
zx|y;L40+oRQ|%Uuc(RA=j`of(9h2*MwpNiO(bpxo_4-`EZFN#6_212sPI97AT5YRQ
zi~pg)JK`T?W489__Wo+{x4-%He`|2||4yj?oQyObMc(||*aNu=VQ6v~LzENMgbcz0
z#@v`ayb2|rm~i<0Jnp_|%ew_|df#?BH#s6TJd{1zq0rH{(9E$
z{=B)E)#v|wdc7jp@w`le+`u->dcE8sQS*jK}$V`=}h(>!hM-3uNUi*s{|
z-#+c2vGgHiujAv$QM5K;HrACVEXpa#YwzCR635p%N3hCMZYkv>oaR!-v;5*=8*#fU
zbdsyHq&}oOb|b7N%uR?ZrM1ZRy8kB!or?SYSI0KZ7kC;bH$hL#mu?jP28T)cU<}-;
zXefHg+@G(G7g~v2J-s-NovpGpv{~j3i?*%hnrHZ{cpU4@X8eUg#>nlPAHD?K23M9I
zDdblRbJQ>%<1Wp>seFy^r4mnXEHm0$`RP5sWYU=fmj!+yIt^-^aCz@Nu57AuFDp
zGU_UZ{EZsT+!||Fltc2awRvWbF>M%QA~7=bj1etj%;W>sX2?z(bx$u5$$~E3J%rk;
z>dM2#TgE!!k2|wH1P|O`J>gzjaVf83x#px{$v6Rl@_@%I4zINy@xW>G%OmM(!#9vS
zVwRFgs)}kxdvOm63Jl$nVx#qm@fov}h$NVCe47yKoX)i7j5rFYhv^*Zepon7TR?Ip
z7s3^Wm#ft`~GSVqw&vSn$6GyUYv`xZBkZ@M|
zCRJPIyr2*c+1<7LTEG9KcROxB*0z4lfg`V?FXmta(S%ka|9ko*N*%_;$%C#c;ke>W7E8&H5J{!rjVv1
zwU3Oij`1!pMfyJY(}Ar4NCso-gW;xb`ZaP9>f*VyxmSpMAwdQLO0q98cmV-&4%wmU
z9@39SRcr>7>_$Zi-_)~Wv=L26lDI-44IALL4pd4sCe@FN{RKAfSn2|59MuA3t`$m>
zg6;j3YmP=VyF>Tq0QMvAjb=OYP_b1K?*2&CylQz0l81Cw+nDFn$z~mBij|cjZ2Z0bEKUBn)evuejjGL{8VDOVh4$OZ|C8is!emG=m8
zQ^0gcG%wxN5yE!q$qPL^lsqDvJ}5T5SLw6O93>t1!@|aD)P3NZ!(n9V0jjNUeix%T
zB<`PIP63!dL|RsGNeC9tr`9ZGp*q^
zMdhN+%r#{;rq_SeCWno+MWc+gn}|O|ck<}vgBN;v5F+$M5w!D*I#4&4HBmfo|I+4o!nLaSd7^ypMbrF%O^VK$6{1@#Bp#Dt6x{5EMyubZ)
zV4gdDHHcy(!5mH51yo$RUff^kMw;w6h2URXPWrqJ`wW!~^KP!l6=HG)TC^o-7DEjq
zA#O!+ukMml!>CUv(HDU!f5tFYL2Fv>-t^vvSf!IWTYfi2pW0DZiBzcFO_*?wJDzRw
zzZ`GhpQMXakn0}EAJzDqm4LUfbSwJ|3~<-(#8;$P^&yEPTVap+^JPI)L)pnCI7mPk
z>|rSu_YxB6ZDi8B?MIIfFvAqd>V&gWB>i^1c?62)NlAtplMgI+Abn(|Ce?{n_1?_|
zMkF5S!OoD)EM}XwBu6^!S5G=uqliyP(+CrA%*NnW;6}m~r9_KSONo{6dd>ry(|b{E
zb4hm;+zdrMoe}-rK7XG~!8pxNyLJUPLX^_pwxvTVWKgFvlT@--;NI=`yCPsv@%Zyj
zlo5S>%C(VK_
z5!^0A#D=A@lOdc0QGyN_;T#%j>q>a==}(P!DXvJN^34ikJYA~6tV$^i-jmf{_BpR4
zBraC&2Dw@9>%y_1_0U|mP`VXB>qDk%3sht09HY#f}NAMi%Z4rfMK69H`bBtlb(GuXDn6DL-d6y!qA$g-xUQXD^2_hd64K_
zc%;T7doR~mQ$~p}8ddF{=&+ZMMjSk<)Sa3*K*F$j^1;TDiqpVUE>;!={|@Xd(@gxn
zC%Do(Dn1ISr}Vr4gU<(x`aE3Sm>60@w+j{@Gk?%Y+t|Q5+gXpnYQ-k0T$D^s^!<%uZrS
z*rtH+-{mJYQcfes_Bz)d0E9W-U4$Wz{(Q0cVUU6Q2rm0$Pu_uOWC+nv*+svs9U#L^
zUR-tnJLdgHBG^Kxm^LKP8)KM1DAf)(m4w#5a;qR}NCfl)HOH<^Vy7tefIB=mQU)h|
zD3NR@M~kb`!u02X;*^awe<{|vdB3VP{urfz3z~vvy+tfO@Kz{V5zHeJl3!a-Ldo_1
zxU}tlI`mDfGYz0kt{%!gX*nCca7)iTd&V+ioaiTIo-~hGJLMW{YS0fPJs^w5Ij-csBDM+ss*U@SvoO
zCQcrviJ%)IS<#$hmOKRALlc6|?~>TCZB#Zy1Z5fx3qMCLEb56@h?{>=2j
ztbj9{a*?~+Gm9B9qn%ZX0$gm2uF>o%q1F>$%`c)2Hgq+~;>v%$P%K{_WH>%GLE|`2
z^x-0e%|$4N*<*KV+hFoqsZY)ov_o|%i163A(GgHL%vokwhgQiRQiirxGpF-x9TGdO
z9dB?sq}VZ|6R4?hEl!jbgkd|{8HSg{9bqB*5)jvm{~W&^j}nND%W{(UTrcw>&tMP>
z9RE~sw4MfHyw`}qOhOAJS*T@ts5G3w7oe6HnI~AXZ_;(|G-qagCuPYQmyQhi?J6^?
z#cta!XHT>hxmZ%5WXYpDvWc4t(Ietx{>5yFwtHArtnNWg2DZr6cYRo!4ok0vJn`lY
zld=hVUz8zSp!HpwfC+BqeJM5$>e#WXmA&MQ>x}7cX4NgVNd^CPEgL(yR>H6tU^TIC&3NR0_Tl%v2auq
zPie*2$Ua2>#r{m&hh-J=9oas{>gO84Ywsv+w;tt~yv{PLF{P8|G+
zpP~#f2m#Rlufd=6Z#w>83*o=F|I_Xd^WOu&K$t*tOXZip2p54%!c+ev{uBIvJ9hpF
z4*&ZHUZXwv;)eIO2XoulnNB<9k|8;r*
literal 0
HcmV?d00001
diff --git a/demo/hiv1-lanl.yml b/demo/hiv1-lanl.yml
new file mode 100644
index 0000000..aaaf423
--- /dev/null
+++ b/demo/hiv1-lanl.yml
@@ -0,0 +1,53 @@
+name: hiv1-lanl-whole
+
+experiments:
+ subtype:
+ expand_options:
+ k: 5..6
+ min_group_pts: 18
+ include_recombinants: true
+ dataset:
+ archive: hiv1
+ archive_folder: lanl-whole
+ metadata: hiv1-lanl-whole
+ selection_key: subtype
+ groups: |
+ lambda options, metadata:
+ import collections
+ counts = collections.Counter(x[options['selection_key']] for x in metadata)
+ return {v: {'selection_key': options['selection_key'], 'values': [v]} for v in counts if v and counts[v] >= options['min_group_pts']}
+
+steps:
+ - type: select
+ copy_for_options: [k]
+ pick_group: |
+ lambda metadata, group_options, options:
+ return [x for x in metadata if (options['include_recombinants'] or not x['recombinant']) and
+ x[group_options['selection_key']] in group_options['values']]
+
+ - type: kmers
+ output_file: cgrs.mm-repr
+ mode: frequencies
+ k: from_options
+ bits_per_element: 16
+
+ - type: classify
+ features_file: cgrs.mm-repr
+ output_file: classification-kmers.json
+ validation_count: 10
+ classifiers:
+ #- 10-nearest-neighbors
+ #- nearest-centroid-mean
+ #- nearest-centroid-median
+ #- logistic-regression
+ #- sgd
+ - linear-svm
+ #- quadratic-svm
+ #- cubic-svm
+ #- decision-tree
+ #- random-forest
+ #- adaboost
+ #- gaussian-naive-bayes
+ #- lda
+ #- qda
+ - multilayer-perceptron
diff --git a/samples/settings.yml b/demo/settings.yml
similarity index 88%
rename from samples/settings.yml
rename to demo/settings.yml
index 91d932b..0b89704 100644
--- a/samples/settings.yml
+++ b/demo/settings.yml
@@ -2,11 +2,11 @@
# this is required
local_dirs:
# the directory containing zipped datasets
- archives: /data/archives
+ archives: data
# the directory containing JSON metadata files
- metadata: /data/metadata
+ metadata: data
# the directory for storage of job output
- output: /data/output
+ output: output
# if desired, specifies an external service to use for logging
# this is optional
diff --git a/kameris/__init__.py b/kameris/__init__.py
index 33d6f14..60df6c4 100644
--- a/kameris/__init__.py
+++ b/kameris/__init__.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '0.6.dev1'
+__version__ = '1.0.0'
diff --git a/kameris/__main__.py b/kameris/__main__.py
index 4042fcb..ff62356 100644
--- a/kameris/__main__.py
+++ b/kameris/__main__.py
@@ -44,7 +44,8 @@ def main():
except Exception as e:
log = logging.getLogger('kameris')
message = 'an unexpected error occurred: {}: {}'.format(
- type(e).__name__, e.message or str(e)
+ type(e).__name__,
+ (e.message if hasattr(e, 'message') else '') or str(e)
)
if log.handlers:
log.error(message)
diff --git a/kameris/schemas/file_urls.json b/kameris/schemas/file_urls.json
index 05ae68e..c4ff512 100644
--- a/kameris/schemas/file_urls.json
+++ b/kameris/schemas/file_urls.json
@@ -2,19 +2,33 @@
"$schema": "http://json-schema.org/draft-04/schema#",
"type": "object",
"properties": {
- "metadata": {"$ref": "#/definitions/url_list"},
- "archives": {"$ref": "#/definitions/url_list"},
- "models": {"$ref": "#/definitions/url_list"}
+ "metadata": {
+ "type": "object",
+ "additionalProperties": {"$ref": "#/definitions/url"}
+ },
+ "archives": {
+ "type": "object",
+ "additionalProperties": {"$ref": "#/definitions/url"}
+ },
+ "models": {
+ "type": "object",
+ "additionalProperties": {
+ "type": "object",
+ "properties": {
+ "python2": {"$ref": "#/definitions/url"},
+ "python3": {"$ref": "#/definitions/url"}
+ },
+ "additionalProperties": false,
+ "required": ["python2", "python3"]
+ }
+ }
},
"additionalProperties": false,
"definitions": {
- "url_list": {
- "type": "object",
- "additionalProperties": {
- "type": "string",
- "pattern": "http(s)?://.*"
- }
+ "url": {
+ "type": "string",
+ "pattern": "http(s)?://.*"
}
}
}
diff --git a/kameris/subcommands/classify.py b/kameris/subcommands/classify.py
index 72c2b63..6e08a7e 100644
--- a/kameris/subcommands/classify.py
+++ b/kameris/subcommands/classify.py
@@ -25,7 +25,7 @@ def run(args):
model_url = args.model
else:
model_url = download_utils.url_for_file(args.model + '.mm-model',
- args.urls_file, 'model')
+ args.urls_file, 'models')
model_file = download_utils.open_url_cached(model_url, 'rb',
args.force_download)
diff --git a/kameris/utils/download_utils.py b/kameris/utils/download_utils.py
index 2d0a41f..9fcf909 100644
--- a/kameris/utils/download_utils.py
+++ b/kameris/utils/download_utils.py
@@ -8,6 +8,7 @@
import requests
from ruamel.yaml import YAML
from six.moves import urllib
+import sys
from tqdm import tqdm
from . import defaults, fs_utils, job_utils
@@ -53,7 +54,11 @@ def url_for_file(path, urls_file, filetype): # NOQA (cache line above)
))
filename = os.path.splitext(os.path.basename(path))[0]
- return urls[filetype][filename]
+ if filetype == 'models':
+ python_ver = 'python{}'.format(sys.version_info.major)
+ return urls[filetype][filename][python_ver]
+ else:
+ return urls[filetype][filename]
def open_url_cached(url, mode, force_download=False):
@@ -63,7 +68,7 @@ def open_url_cached(url, mode, force_download=False):
'cache')
fs_utils.mkdir_p(cache_dir)
- cache_key = hashlib.md5(url).hexdigest()
+ cache_key = hashlib.md5(url.encode('utf-8')).hexdigest()
cache_filename = os.path.join(cache_dir, cache_key)
if not force_download and os.path.exists(cache_filename):
log.info("file '%s' already downloaded, using cached version", url)