Merge pull request #4464 from broadinstitute/dev
Dev
hanars authored Oct 31, 2024
2 parents 424715b + c923ea1 commit d4df2c2
Showing 18 changed files with 510 additions and 82 deletions.
11 changes: 11 additions & 0 deletions .cloudbuild/seqr-vlm-docker.cloudbuild.yaml
@@ -0,0 +1,11 @@
steps:
- name: 'gcr.io/kaniko-project/executor:v1.3.0'
args:
- --destination=gcr.io/seqr-project/seqr-vlm:${COMMIT_SHA}
- --destination=gcr.io/seqr-project/seqr-vlm:${_CUSTOM_BRANCH_TAG}
- --destination=gcr.io/seqr-project/seqr-vlm:latest
- --dockerfile=vlm/deploy/Dockerfile
- --cache=true
- --cache-ttl=168h

timeout: 1800s
1 change: 1 addition & 0 deletions .dockerignore
@@ -6,6 +6,7 @@ static/*
deploy/*
!deploy/docker/seqr/*
!deploy/docker/hail_search/*
!deploy/docker/vlm/*
hail_search/fixtures/*
.git
.vscode
64 changes: 64 additions & 0 deletions .github/workflows/dev-vlm-release.yaml
@@ -0,0 +1,64 @@
name: vlm dev release
on:
workflow_run:
workflows: ["VLM Unit Tests"]
types:
- completed
branches:
- dev

permissions:
id-token: write

jobs:
docker:
runs-on: ubuntu-latest
if: ${{ github.event.workflow_run.conclusion == 'success' }}
steps:
- name: checkout
uses: actions/checkout@v3
with:
ref: ${{ github.event.workflow_run.head_branch }}

- name: authenticate to google cloud
id: "auth"
uses: google-github-actions/auth@v0
with:
workload_identity_provider: "${{ secrets.WORKLOAD_IDENTITY_PROVIDER }}"
service_account: "${{ secrets.RUN_SA_EMAIL }}"

- name: "setup gcloud sdk"
uses: google-github-actions/setup-gcloud@v0

- name: Build and push images
run: |-
gcloud builds submit --quiet --substitutions="COMMIT_SHA=${{ github.event.workflow_run.head_sha }},_CUSTOM_BRANCH_TAG=gcloud-dev" --config .cloudbuild/seqr-vlm-docker.cloudbuild.yaml --gcs-log-dir=gs://seqr-github-actions-logs/logs .
helm_update:
runs-on: ubuntu-latest
needs: docker
steps:
- name: Retrieve tgg-helm repo for broad seqr chart
uses: actions/checkout@v3
with:
repository: broadinstitute/tgg-helm
token: ${{ secrets.SEQR_VERSION_UPDATE_TOKEN }}
ref: main
persist-credentials: false
fetch-depth: 0

- name: update image tag in the dev broad seqr chart
uses: mikefarah/yq@v4.22.1
with:
cmd: >
yq -i '.seqr-platform.vlm.image.tag = "${{ github.event.workflow_run.head_sha }}"' charts/dev-broad-seqr/values.yaml
- name: Commit and Push changes
uses: Andro999b/push@v1.3
with:
repository: broadinstitute/tgg-helm
branch: main
github_token: ${{ secrets.SEQR_VERSION_UPDATE_TOKEN }}
author_email: ${{ github.actor }}@users.noreply.github.com
author_name: tgg-automation
message: "Update VLM dev release docker tag to ${{ github.event.workflow_run.head_sha }}"
7 changes: 6 additions & 1 deletion .github/workflows/docker-lint.yaml
@@ -10,6 +10,7 @@ on:
paths:
- deploy/docker/seqr/Dockerfile
- hail_search/deploy/Dockerfile
- vlm/deploy/Dockerfile
- .hadolint.yaml
- .docker-compose.yaml
- .github/workflows/docker-lint.yaml
@@ -21,6 +22,7 @@ on:
paths:
- deploy/docker/seqr/Dockerfile
- hail_search/deploy/Dockerfile
- vlm/deploy/Dockerfile
- .hadolint.yaml
- .docker-compose.yaml
- .github/workflows/docker-lint.yaml
@@ -31,10 +33,13 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: Validate docker compose
run: docker-compose -f docker-compose.yml config
run: docker compose -f docker-compose.yml config
- uses: hadolint/hadolint-action@v1.5.0
with:
dockerfile: deploy/docker/seqr/Dockerfile
- uses: hadolint/hadolint-action@v1.5.0
with:
dockerfile: hail_search/deploy/Dockerfile
- uses: hadolint/hadolint-action@v1.5.0
with:
dockerfile: vlm/deploy/Dockerfile
8 changes: 6 additions & 2 deletions .github/workflows/unit-tests.yml
@@ -9,12 +9,16 @@ on:
- dev
paths-ignore:
- 'hail_search/**'
- '.github/workflows/hail-search-unit-tests.yaml'
- 'vlm/**'
- '.github/workflows/*hail-search*.yaml'
- '.github/workflows/*vlm*.yaml'
pull_request:
types: [opened, synchronize, reopened]
paths-ignore:
- 'hail_search/**'
- '.github/workflows/hail-search-unit-tests.yaml'
- 'vlm/**'
- '.github/workflows/*hail-search*.yaml'
- '.github/workflows/*vlm*.yaml'

jobs:
python:
33 changes: 33 additions & 0 deletions .github/workflows/vlm-unit-tests.yaml
@@ -0,0 +1,33 @@
name: VLM Unit Tests

# Run the test suite on pushes (incl. merges) to master and dev
# Run the test suite when a PR is opened, pushed to, or reopened
on:
push:
branches:
- master
- dev
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'
pull_request:
types: [opened, synchronize, reopened]
paths:
- 'vlm/**'
- '.github/workflows/*vlm*.yaml'

jobs:
vlm:
runs-on: ubuntu-latest
container: hailgenetics/hail:0.2.128

steps:
- uses: actions/checkout@v2
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip wheel
pip install -r hail_search/requirements-test.txt
- name: Run coverage tests
run: |
coverage run --source="./vlm" --omit="./vlm/__main__.py" -m pytest vlm/
coverage report --fail-under=90
2 changes: 1 addition & 1 deletion README.md
@@ -20,7 +20,7 @@ seqr consists of the following components:
The seqr production instance runs on Google Kubernetes Engine (GKE) and data is loaded using Google Dataproc Spark clusters.

On-prem installs can be created using docker-compose:
**[Local installs using docker-compose](deploy/LOCAL_INSTALL.md)**
**[Local installs using docker-compose](deploy/LOCAL_INSTALL.md)**

To set up seqr for local development, see instructions **[here](deploy/LOCAL_DEVELOPMENT_INSTALL.md)**

118 changes: 118 additions & 0 deletions deploy/LOCAL_INSTALL_HELM.md
@@ -0,0 +1,118 @@
## Starting and Updating *seqr*

Detailed instructions for how to install and update *seqr* may be found in the [seqr-helm](https://github.com/broadinstitute/seqr-helm) repository.
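
As a point of reference, a minimal install sketch is shown below. The chart repository URL, chart name, and release name here are assumptions for illustration only; the seqr-helm README is the authoritative source for the actual commands.
```bash
# Hypothetical sketch — the repo URL and chart name below are assumptions,
# not documented values; consult the seqr-helm README for the real ones.
helm repo add seqr-helm https://broadinstitute.github.io/seqr-helm   # assumed repo URL
helm repo update

# Install (or upgrade) a release named "seqr" using your values overrides
helm upgrade --install seqr seqr-helm/seqr-platform -f my-values.yaml  # assumed chart name
```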

### Configuring Authentication for seqr

#### Username/password basic auth
This is the default authentication mechanism for seqr. After seqr is running, you can run the following steps to create an initial superuser account.
All other user accounts can then be added through normal application use.

```bash
# Get the name of the running seqr pod
kubectl get pod

# Open a shell in the pod (replace seqr-POD-ID with a name from the previous command)
kubectl exec -it seqr-POD-ID -- /bin/bash
./manage.py createsuperuser
```
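
Equivalently, the superuser prompt can be reached in one step without opening a shell first; a small sketch, assuming the pod name starts with `seqr`:
```bash
# Run createsuperuser directly in the first pod whose name starts with "seqr"
kubectl exec -it "$(kubectl get pod -o name | grep '^pod/seqr' | head -n 1)" -- ./manage.py createsuperuser
```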

#### Google OAuth2
Using Google OAuth2 for authentication requires setting up a Google Cloud project and configuring the seqr instance
with the project's client ID and secret by setting the following environment variables in your [helm values overrides](https://github.com/broadinstitute/seqr-helm?tab=readme-ov-file#valuesenvironment-overrides):
```yaml
seqr:
environment:
- SOCIAL_AUTH_GOOGLE_OAUTH2_CLIENT_ID=your-client-id
- SOCIAL_AUTH_GOOGLE_OAUTH2_SECRET=your-client-secret
```
Note that user accounts do NOT need to be associated with this Google Cloud
project in order to have access to seqr. Users' emails must be explicitly added to at least one seqr project for them to
gain any access to seqr, and any valid Gmail account can be used.
#### Azure OAuth2
Using Azure OAuth2 for authentication requires setting up an Azure tenant and configuring the seqr instance with the
tenant and its client ID and secret by setting the following environment variables in your [helm values overrides](https://github.com/broadinstitute/seqr-helm?tab=readme-ov-file#valuesenvironment-overrides):
```yaml
seqr:
environment:
- SOCIAL_AUTH_AZUREAD_V2_OAUTH2_CLIENT_ID=your-client-id
- SOCIAL_AUTH_AZUREAD_V2_OAUTH2_SECRET=your-client-secret
- SOCIAL_AUTH_AZUREAD_V2_OAUTH2_TENANT=your-tenant-id
```
Note that user accounts must be directly associated with the Azure tenant in order to access seqr. Anyone with access
to the tenant will automatically have access to seqr, although they will only be able to view those projects that they
have been added to.
## Enabling ClinGen Allele Registration
- Turning on this feature will register your variants with the ClinGen Allele Registry during VCF ingestion. The [Registry](https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/landing) provides and maintains unique variant identifiers, and enabling registration also enables several small related features in seqr.
- You will first need to [register](https://reg.clinicalgenome.org/cg-prof/new) and receive a login and password.
- Create a Kubernetes secret called `pipeline-secrets` with your login and password embedded (a verification sketch follows the overrides below):
```bash
kubectl create secret generic pipeline-secrets \
  --from-literal=clingen_allele_registry_login='my-login' \
  --from-literal=clingen_allele_registry_password='my-password'
```
- Add secret references in your [values overrides](https://github.com/broadinstitute/seqr-helm?tab=readme-ov-file#valuesenvironment-overrides):
```yaml
pipeline-runner:
additionalSecrets:
- name: CLINGEN_ALLELE_REGISTRY_LOGIN
valueFrom:
secretKeyRef:
name: pipeline-secrets
key: clingen_allele_registry_login
- name: CLINGEN_ALLELE_REGISTRY_PASSWORD
valueFrom:
secretKeyRef:
name: pipeline-secrets
key: clingen_allele_registry_password
```
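
To confirm the secret was created with both keys before triggering a pipeline run, a quick optional check:
```bash
# Show the keys stored in the secret (values are not printed in plain text)
kubectl describe secret pipeline-secrets
```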

## Using the Load Data page to load VCF Callsets
- Copy your VCF into the loading datasets directory on the node running your Kubernetes cluster (`/var/seqr/seqr-loading-temp/`); see the copy sketch after this list. You should see your VCF present when listing files:
```
ls -h /var/seqr/seqr-loading-temp/
loading_pipeline_queue test.vcf.gz
```
- In the top header of *seqr*, click on the **Data Management** button.
- In the subheader, click on **Load Data**.
- Type the callset file name into the **Callset File Path** text box (without the directory prefix), and select the appropriate Sample Type (WES/WGS) and Genome Version (GRCh37/GRCh38) for your project. The pipeline includes a sequence of validation steps to ensure the validity of your VCF; these may be skipped by enabling the **Skip Callset Validation** option, but we strongly recommend leaving validation enabled to ensure the quality of your analysis.
- Click through to the next page and select your project from the **Projects to Load** dropdown, then click **Submit**.
- If you wish to check the status of the loading request, you can click through to the **Pipeline Status** tab to view the loading pipeline interface.
- Data should be loaded into the search backend automatically, usually within a few hours.
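
For the first step above, a minimal copy sketch (the source path and node hostname are hypothetical):
```bash
# Copy a bgzipped VCF from your workstation to the loading directory on the cluster node
scp /path/to/test.vcf.gz my-seqr-node:/var/seqr/seqr-loading-temp/
```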

## Loading RNASeq datasets

Currently, seqr has a preliminary integration for RNA data, which requires running publicly available pipelines
outside of the seqr platform. After these pipelines are run, their output must be annotated with metadata from seqr to
ensure samples are associated with the correct seqr families. Once this is done, the data can be added to seqr from the
"Data Management" > "Rna Seq" page. You will need to provide the file path for the data and the data type. The file
path can be either a gs:// path to a Google Cloud Storage bucket or a path to a local file stored in the `/var/seqr` folder.

The following data types are supported:

#### Gene Expression

seqr accepts normalized expression TPMs from STAR or RNAseqQC. TSV files should have the following columns:

- sample_id
- project
- gene_id
- TPM
- tissue
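
For illustration, a hypothetical row in this format (tab-separated; the header line and all values below are made up for the example):
```
sample_id	project	gene_id	TPM	tissue
SAM_001	my-project	ENSG00000186092	12.5	whole_blood
```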

#### Expression Outliers

seqr accepts gene expression outliers from OUTRIDER. TSV files should have the following columns:

- sampleID
- geneID
- pValue
- padjust
- zScore

#### IGV

Splice junctions (.junctions.bed.gz) and coverage (.bigWig) can be visualized in seqr using IGV.
See [ReadViz Setup](READVIZ_SETUP.md) for
instructions on adding this data, as the process is identical for all IGV tracks.
2 changes: 2 additions & 0 deletions deploy/READVIZ_SETUP.md
@@ -9,6 +9,8 @@ place the bam/cram and index files in the `./data/readviz/` sub-directory that w
the docker-compose.yml file. When you start seqr, docker-compose will mount the `./data/readviz/` directory to a `/readviz`
top-level directory inside the seqr container.

If running seqr using the [`seqr-helm` charts](https://github.com/broadinstitute/seqr-helm), place the bam/cram and index files inside `/var/seqr/seqr-static-media`.

1) Create a tab-delimited or comma-delimited text file - let's call it `bam_paths.tsv` - with these 2 columns (and no
header line):
