From 1b3e08025b8fcae39e9c9fe70e1dcde97707ffcd Mon Sep 17 00:00:00 2001 From: tina-z-jia <145156075+tina-z-jia@users.noreply.github.com> Date: Thu, 16 Jan 2025 10:34:33 -0800 Subject: [PATCH] esm 3.1.3 release (#184) --- CONTRIBUTIONS.md | 23 +- LICENSE.md | 23 ++ README.md | 285 +++++++-------- cookbook/local/README.md | 1 + .../local/open_generate.ipynb | 0 {examples => cookbook/local}/raw_forwards.py | 0 cookbook/snippets/README.md | 1 + .../snippets/esm3.py | 14 +- .../snippets/esmc.py | 7 + .../snippets/fold_invfold.py | 54 ++- .../tutorials/1_esmprotein.ipynb | 251 +++++++------ cookbook/tutorials/2_embed.ipynb | 342 ++++++++++++++++++ .../tutorials/3_gfp_design.ipynb | 0 .../tutorials/4_forge_generate.ipynb | 4 +- cookbook/tutorials/README.md | 8 + esm/__init__.py | 2 +- esm/models/esmc.py | 8 +- esm/models/function_decoder.py | 13 +- esm/sdk/api.py | 1 + esm/sdk/forge.py | 29 +- esm/utils/constants/models.py | 4 + esm/utils/generation.py | 5 +- esm/utils/generation_test.py | 4 +- esm/utils/misc.py | 8 + esm/utils/sampling.py | 5 +- examples/forge_generate.py | 13 - pyproject.toml | 2 +- tools/generate.ipynb | 124 +++---- tools/invfold.ipynb | 211 +++++++---- tools/predict.ipynb | 178 +++++---- 30 files changed, 1072 insertions(+), 548 deletions(-) create mode 100644 cookbook/local/README.md rename examples/generate.ipynb => cookbook/local/open_generate.ipynb (100%) rename {examples => cookbook/local}/raw_forwards.py (100%) create mode 100644 cookbook/snippets/README.md rename examples/local_generate.py => cookbook/snippets/esm3.py (93%) rename examples/esmc_examples.py => cookbook/snippets/esmc.py (87%) rename examples/folding_inverse_folding_example.py => cookbook/snippets/fold_invfold.py (62%) rename examples/esmprotein.ipynb => cookbook/tutorials/1_esmprotein.ipynb (82%) create mode 100644 cookbook/tutorials/2_embed.ipynb rename examples/gfp_design.ipynb => cookbook/tutorials/3_gfp_design.ipynb (100%) rename examples/forge_generate.ipynb => 
cookbook/tutorials/4_forge_generate.ipynb (99%) create mode 100644 cookbook/tutorials/README.md delete mode 100644 examples/forge_generate.py diff --git a/CONTRIBUTIONS.md b/CONTRIBUTIONS.md index 5d06b52..e82ed15 100644 --- a/CONTRIBUTIONS.md +++ b/CONTRIBUTIONS.md @@ -27,8 +27,7 @@ python -c 'from huggingface_hub import login; login()' # Ensure package can correctly interact with forge. # This will require an API key from forge.evolutionaryscale.ai -ESM_API_KEY=$ESM_API_KEY PYTHONPATH='.' python examples/forge_generate.py - +ESM_API_KEY=$ESM_API_KEY PYTHONPATH='.' python cookbook/snippets/esm3.py ``` @@ -36,21 +35,17 @@ ESM_API_KEY=$ESM_API_KEY PYTHONPATH='.' python examples/forge_generate.py 1. Ensure the following scripts run without errors. Most have a pip install command installing the published `esm` package - comment this out so your release candidate version is tested and not the already published version. ```bash -ESM_API_KEY=$ESM_API_KEY PYTHONPATH='.' python examples/forge_generate.py +ESM_API_KEY=$ESM_API_KEY PYTHONPATH='.' python cookbook/snippets/esm3.py pip install treon -treon examples/esmprotein.ipynb -treon examples/gfp_design.ipynb +treon cookbook/tutorials/1_esmprotein.ipynb +treon cookbook/tutorials/2_embed.ipynb +treon cookbook/tutorials/3_gfp_design.ipynb +treon cookbook/tutorials/4_forge_generate.ipynb # requires a GPU -treon examples/generate.ipynb -python examples/local_generate.py -python examples/raw_forwards.py +python cookbook/snippets/esm3.py +python cookbook/snippets/esmc.py +python cookbook/local/raw_forwards.py ``` - -`examples/esmprotein.ipynb` works. Remember to skip running the first cell - it will reinstall stock esm instead of your deployed version. - -3. Ensure `examples/generate.ipynb` works. Note this notebook will require a node with a GPU that can fit ESM3 small open. - -4. 
Ensure diff --git a/LICENSE.md b/LICENSE.md index 9798d1d..eaccb29 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -11,3 +11,26 @@ Here are the different licenses that govern access to the ESM codebase and the m | Governed by API Agreements (See Below) | API access to all models, including API-only models (ESM3 family, ESM C 6B) | | [Forge API Terms of Use](https://www.evolutionaryscale.ai/policies/terms-of-use) | Free non-commercial API access via Forge to all models including API-only models (ESM3 family, ESM C 6B) | | [Cambrian Inference Clickthrough License Agreement](https://www.evolutionaryscale.ai/policies/cambrian-inference-clickthrough-license-agreement) | Commercial Inference via SageMaker for all ESM C models | + + +# How can I access the models and which licenses apply? + +The models can be accessed in three different ways, each with its own licensing terms. + +1. **Code and weights** via GitHub and HuggingFace are available under either a [non-commercial](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement) (ESM C 600M, ESM3-small-open) or an [open license](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement) (codebase, ESM C 300M). + * **Building with ESM encouraged**: You can use embeddings, model predictions, fine-tune the models and use components of both the models and code. We strongly encourage anyone to build on ESM C and ESM3! Just remember to maintain the same license terms and release under the ESM name. +2. **Free non-commercial inference API** via Forge. All models are available this way, with free credits granted to students and researchers. We want to enable academics under [non-commercial Terms of Use](https://www.evolutionaryscale.ai/policies/terms-of-use), which mirrors the non-commercial license. +3. **Paid commercial Inference API** for commercial use via SageMaker (Forge coming soon). 
All ESM C models are available this way to commercial entities for commercial use under a [clickthrough license agreement](https://www.evolutionaryscale.ai/policies/cambrian-inference-clickthrough-license-agreement) with few restrictions. + * In broad strokes: standard commercial use like developing molecules and developing downstream ML models and methods with the model is allowed, while training competing models on the API outputs is not. + * Note: For ESM3 commercial use, reach out to [bd@evolutionaryscale.ai](mailto:bd@evolutionaryscale.ai) + +### What changed with the release of ESM C? + +We introduced a [clickthrough license agreement](https://www.evolutionaryscale.ai/policies/cambrian-inference-clickthrough-license-agreement) to enable frictionless commercial use of ESM C. + +We introduced the new [Cambrian Open License](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement) for ESM C 300M, and at the same time moved all code in the [`esm` repo](https://github.com/evolutionaryscale/esm) under that permissive license. + +The [Cambrian non-commercial license](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement) is largely based on the original [ESM3 Community License Agreement](https://www.evolutionaryscale.ai/policies/community-license-agreement), but removed the clause that restricted drug development, added the naming requirement, and extended the meaning of “Derivative Work” to allow training on model outputs. Just remember to release models and methods built on ESM under the same license. +These changes are meant to remove potential gray areas and points of friction for researchers building with ESM. + +Finally, the ESM3-open-small model has been moved under the [Cambrian non-commercial license](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement). 
diff --git a/README.md b/README.md index 531459b..b7db77a 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,20 @@ - [Installation ](#installation-) -- [ESM C ](#esm-c-) - - [ESM C Local Models via GitHub ](#esm-c-local-models-via-github-) - - [Using ESM C 6B via Forge API](#using-esm-c-6b-via-forge-api) - - [ESM C via Forge API for Free Non-Commercial Use ](#esm-c-via-forge-api-for-free-non-commercial-use--) - - [ESM C via SageMaker for Commercial Use ](#esm-c-via-sagemaker-for-commercial-use--) - - [ESM C Example Usage](#esm-c-example-usage) -- [ESM 3 ](#esm-3--) - - [Quickstart for ESM3-open ](#quickstart-for-esm3-open-) - - [EvolutionaryScale Forge: Access to larger ESM3 models](#evolutionaryscale-forge-access-to-larger-esm3-models) +- [ESM 3](#esm-3-) + - [Quickstart for ESM3 Open](#esm3-quickstart-) + - [ESM3 98B via Forge API](#esm3-forge) - [ESM3 Example Usage](#esm3-example-usage) -- [Responsible Development ](#responsible-development-) -- [Licenses ](#licenses--) - - [How can I access the models and which licenses apply?](#how-can-i-access-the-models-and-which-licenses-apply) - - [What changed with the release of ESM C?](#what-changed-with-the-release-of-esm-c) +- [ESM C](#esm-c-) + - [Quickstart for ESM C Open Models](#esm-c-open-) + - [ESM C 6B via Forge API](#esm-c-forge-) + - [ESM C via SageMaker for Commercial Use ](#esm-c-sagemaker-) + - [ESM C Example Usage](#esm-c-example-) +- [Responsible Development](#responsible-development-) +- [Licenses](#licenses-) +- [Citations ](#citations-) +This repository contains flagship protein models for EvolutionaryScale, as well as access to the API. [ESM3](https://www.evolutionaryscale.ai/papers/esm3-simulating-500-million-years-of-evolution-with-a-language-model) is our flagship multimodal protein generative model, and can be used for generation and prediction tasks. 
[ESM C](https://www.evolutionaryscale.ai/blog/esm-cambrian) is our best protein representation learning model, and can be used to embed protein sequences. + ## Installation To get started with ESM, install the python library using pip: @@ -23,18 +23,93 @@ To get started with ESM, install the python library using pip: pip install esm ``` -## ESM C -[ESM Cambrian](https://www.evolutionaryscale.ai/blog/esm-cambrian) is a parallel model family to our flagship ESM3 generative models. While ESM3 focuses on controllable generation of proteins for therapeutic and many other applications, ESM C focuses on creating representations of the underlying biology of proteins. +## ESM 3 -ESM C comes with major performance benefits over ESM2. The 300M parameter ESM C delivers similar performance to ESM2 650M with dramatically reduced memory requirements and faster inference. The 600M parameter ESM C rivals the 3B parameter ESM2 and approaches the capabilities of the 15B model, delivering frontier performance with far greater efficiency. The 6B parameter ESM C sets a new benchmark, outperforming the best ESM2 models by a wide margin. +[ESM3](https://www.evolutionaryscale.ai/papers/esm3-simulating-500-million-years-of-evolution-with-a-language-model) is a frontier generative model for biology, able to jointly reason across three fundamental biological properties of proteins: sequence, structure, and function. These three data modalities are represented as tracks of discrete tokens at the input and output of ESM3. You can present the model with a combination of partial inputs across the tracks, and ESM3 will provide output predictions for all the tracks. -ESM C models are available immediately for academic and commercial use under a new license structure designed to promote openness and enable scientists and builders. You can find the high level take-away of the license structure in the [Licenses](#licenses) section of this page, and complete license details in [LICENSE.md](LICENSE.md). 
+ESM3 is a _generative_ masked language model. You can prompt it with partial sequence, structure, and function keywords, and iteratively sample masked positions until all positions are unmasked. This iterative sampling is what the `.generate()` function does. -You can use the following guides to start using ESM C models today, either running the model locally, [the Forge API](https://forge.evolutionaryscale.ai/) and [AWS SageMaker](https://aws.amazon.com/marketplace/seller-profile?id=seller-iw2nbscescndm). + +ESM3 Diagram -### ESM C Local Models via GitHub -The code and weights for the ESM C 300M model are available under the Cambrian Open [license agreement](#licenses). The weights for the ESM C 600M model are available under the Cambrian Non-Commercial [license agreement](#licenses). +The ESM3 architecture is highly scalable due to its transformer backbone and all-to-all reasoning over discrete token sequences. At its largest scale, ESM3 was trained with 1.07e24 FLOPs on 2.78 billion proteins and 771 billion unique tokens, and has 98 billion parameters. +Learn more by reading the [blog post](https://www.evolutionaryscale.ai/blog/esm3-release) and [the pre-print (Hayes et al., 2024)](https://www.evolutionaryscale.ai/papers/esm3-simulating-500-million-years-of-evolution-with-a-language-model). +ESM3-open, with 1.4B parameters, is the smallest and fastest model in the family. + +### Quickstart for ESM3-open + +``` +pip install esm +``` + +The weights are stored on HuggingFace Hub under [HuggingFace/EvolutionaryScale/esm3](https://huggingface.co/EvolutionaryScale/esm3). + +```py +from huggingface_hub import login +from esm.models.esm3 import ESM3 +from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig + +# Will instruct you how to get an API key from huggingface hub, make one with "Read" permission. +login() + +# This will download the model weights and instantiate the model on your machine. 
+model: ESM3InferenceClient = ESM3.from_pretrained("esm3-open").to("cuda") # or "cpu" + +# Generate a completion for a partial Carbonic Anhydrase (2vvb) +prompt = "___________________________________________________DQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPP___________________________________________________________" +protein = ESMProtein(sequence=prompt) +# Generate the sequence, then the structure. This will iteratively unmask the sequence track. +protein = model.generate(protein, GenerationConfig(track="sequence", num_steps=8, temperature=0.7)) +# We can show the predicted structure for the generated sequence. +protein = model.generate(protein, GenerationConfig(track="structure", num_steps=8)) +protein.to_pdb("./generation.pdb") +# Then we can do a round trip design by inverse folding the sequence and recomputing the structure +protein.sequence = None +protein = model.generate(protein, GenerationConfig(track="sequence", num_steps=8)) +protein.coordinates = None +protein = model.generate(protein, GenerationConfig(track="structure", num_steps=8)) +protein.to_pdb("./round_tripped.pdb") +``` + +Congratulations, you just generated your first proteins with ESM3! + +### EvolutionaryScale Forge: Access to larger ESM3 models + + +You can access all scales of ESM3 models [EvolutionaryScale Forge](https://forge.evolutionaryscale.ai). + +We encourage users to interact with the Forge API through the python `esm` library instead of the command line. +The python interface enables you to interactively load proteins, build prompts, and inspect generated proteins +with the `ESMProtein` and config classes used to interact with the local model. 
+ +In any example script you can replace a local `ESM3` model with a Forge API client: + +```py +# Instead of loading the model locally on your machine: +model: ESM3InferenceClient = ESM3.from_pretrained("esm3_sm_open_v1").to("cuda") # or "cpu" +# just replace the line with this: +model: ESM3InferenceClient = esm.sdk.client("esm3-medium-2024-08", token="") +# and now you're interfacing with the model running on our remote servers. +... +``` + +and the exact same code will work. +This enables a seamless transition from smaller and faster models, to our largest and most capable protein language models for protein design work. + +### ESM3 Example Usage + + +Check out our [tutorials](./cookbook/tutorials) to learn how to use ESM3. + +## ESM C +[ESM Cambrian](https://www.evolutionaryscale.ai/blog/esm-cambrian) is a parallel model family to our flagship ESM3 generative models. While ESM3 focuses on controllable generation of proteins, ESM C focuses on creating representations of the underlying biology of proteins. + +ESM C is designed as a drop-in replacement for ESM2 and comes with major performance benefits. The 300M parameter ESM C delivers similar performance to ESM2 650M with dramatically reduced memory requirements and faster inference. The 600M parameter ESM C rivals the 3B parameter ESM2 and approaches the capabilities of the 15B model, delivering frontier performance with far greater efficiency. The 6B parameter ESM C outperforms the best ESM2 models by a wide margin. + +ESM C can be run locally, via [the Forge API](https://forge.evolutionaryscale.ai/) or through [AWS SageMaker](https://aws.amazon.com/marketplace/seller-profile?id=seller-iw2nbscescndm). + +### Quickstart for ESM C Open Models When running the code below, a pytorch model will be instantiated locally on your machine, with the weights downloaded from the [HuggingFace hub](https://huggingface.co/EvolutionaryScale). 
```py from esm.models.esmc import ESMC @@ -58,11 +133,9 @@ pip install flash-attn --no-build-isolation You can also disable flash-attn by passing ``use_flash_attn=False`` to utils like ``ESMC_300M_202412``. -### Using ESM C 6B via Forge API -### ESM C via Forge API for Free Non-Commercial Use +### ESM C 6B via Forge API -The ESM C model family, including ESMC 6B, are accessible via EvolutionaryScale Forge for free [non-commercial use](#licenses). -Apply for access and copy the API token from the console by first visiting https://forge.evolutionaryscale.ai. +Apply for access and copy the API token from the console by first visiting [Forge](https://forge.evolutionaryscale.ai). With the code below, a local python client talks to the model inference server hosted by EvolutionaryScale. @@ -83,8 +156,8 @@ Remember to replace `` with your actual Forge access token. ### ESM C via SageMaker for Commercial Use -ESM C models are also available on Amazon SageMaker under the Cambrian Inference Clickthrough License Agreement. -Under this license agreement models are available for broad commercial use to commercial entities. +ESM C models are also available on Amazon SageMaker under the [Cambrian Inference Clickthrough License Agreement](https://www.evolutionaryscale.ai/policies/cambrian-inference-clickthrough-license-agreement). +Under this license agreement, models are available for broad use for commercial entities. You will need an admin AWS access to an AWS account to follow these instructions. To deploy, first we need to deploy the AWS package: @@ -97,12 +170,11 @@ You will need an admin AWS access to an AWS account to follow these instructions 7. Click "Launch CloudFormation Template". This takes 15 to 25 minutes depending on model size. 8. On the "Quick create stack" page, ensure the stack name and endpoint names are not already used. 
You can check existing stack names [here](https://console.aws.amazon.com/cloudformation/home/stacks) and existing endpoint names [here](https://us-east-1.console.aws.amazon.com/sagemaker/home?region=us-east-1#/endpoints). -The Sagemaker deployment of the model now lives on a dedicated GPU instance inside your AWS environment, and will be billed directly to your AWS account. +The SageMaker deployment of the model now lives on a dedicated GPU instance inside your AWS environment, and will be billed directly to your AWS account. Make sure to remember to shut down the instance after you stop using it. Find the CloudFormation stack you created [here](https://us-east-1.console.aws.amazon.com/cloudformation/home), select it, and then click "Delete" to clean up all resources. -After creating the endpoint, you can create a sagemaker client and use it the same way as a forge client. They share the same API. -The local python client talks to the Sagemaker endpoint you just deployed, which runs on an instance with a GPU to run model inference. - +After creating the endpoint, you can create a SageMaker client and use it the same way as a Forge client. They share the same API. +The local python client talks to the SageMaker endpoint you just deployed, which runs on an instance with a GPU to run model inference. Ensure that the code below runs in an environment that has AWS credentials available for the account which provisioned SageMaker resources. Learn more about general AWS credential options [here](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-authentication.html#cli-chap-authentication-precedence). @@ -126,103 +198,9 @@ print(logits_output.logits, logits_output.embeddings) ``` ### ESM C Example Usage - -Look at [esmc_examples.py](./examples/esmc_examples.py) for the standard usage (extracting embeddings and model amino acid prediction). - -More coming soon. 
- -## ESM 3 - -[ESM3](https://www.evolutionaryscale.ai/papers/esm3-simulating-500-million-years-of-evolution-with-a-language-model) is a frontier generative model for biology, able to jointly reason across three fundamental biological properties of proteins: sequence, structure, and function. These three data modalities are represented as tracks of discrete tokens at the input and output of ESM3. You can present the model with a combination of partial inputs across the tracks, and ESM3 will provide output predictions for all the tracks. + -ESM3 is a _generative_ masked language model. You can prompt it with partial sequence, structure, and function keywords, and iteratively sample masked positions until all positions are unmasked. This iterative sampling is what the `.generate()` function does. - - -ESM3 Diagram - -The ESM3 architecture is highly scalable due to its transformer backbone and all-to-all reasoning over discrete token sequences. At its largest scale, ESM3 was trained with 1.07e24 FLOPs on 2.78 billion proteins and 771 billion unique tokens, and has 98 billion parameters. -Learn more by reading the [blog post](https://www.evolutionaryscale.ai/blog/esm3-release) and [the pre-print (Hayes et al., 2024)](https://www.evolutionaryscale.ai/papers/esm3-simulating-500-million-years-of-evolution-with-a-language-model). - -Here we present `esm3-open-small`. With 1.4B parameters it is the smallest and fastest model in the family. -ESM3-open is available under the [Cambrian non-commercial license agreement](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement), as outlined in `LICENSE.md` (note: updated with ESM C release). -Visit our [Discussions page](https://github.com/evolutionaryscale/esm/discussions) to get in touch, provide feedback, ask questions or share your experience with ESM3! 
- -### Quickstart for ESM3-open - -``` -pip install esm -``` - -In order to download the weights, we require users to accept our non-commercial license. -The weights are stored on HuggingFace Hub under [HuggingFace/EvolutionaryScale/esm3](https://huggingface.co/EvolutionaryScale/esm3). -Please create an account and accept the license. - -```py -from huggingface_hub import login -from esm.models.esm3 import ESM3 -from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig - -# Will instruct you how to get an API key from huggingface hub, make one with "Read" permission. -login() - -# This will download the model weights and instantiate the model on your machine. -model: ESM3InferenceClient = ESM3.from_pretrained("esm3-open").to("cuda") # or "cpu" - -# Generate a completion for a partial Carbonic Anhydrase (2vvb) -prompt = "___________________________________________________DQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPP___________________________________________________________" -protein = ESMProtein(sequence=prompt) -# Generate the sequence, then the structure. This will iteratively unmask the sequence track. -protein = model.generate(protein, GenerationConfig(track="sequence", num_steps=8, temperature=0.7)) -# We can show the predicted structure for the generated sequence. -protein = model.generate(protein, GenerationConfig(track="structure", num_steps=8)) -protein.to_pdb("./generation.pdb") -# Then we can do a round trip design by inverse folding the sequence and recomputing the structure -protein.sequence = None -protein = model.generate(protein, GenerationConfig(track="sequence", num_steps=8)) -protein.coordinates = None -protein = model.generate(protein, GenerationConfig(track="structure", num_steps=8)) -protein.to_pdb("./round_tripped.pdb") -``` - -Congratulations, you just generated your first proteins with ESM3! 
- -### EvolutionaryScale Forge: Access to larger ESM3 models - - -You can apply for beta access to the full family of larger and higher capability ESM3 models at [EvolutionaryScale Forge](https://forge.evolutionaryscale.ai). - -We encourage users to interact with the Forge API through the python `esm` library instead of the command line. -The python interface enables you to interactively load proteins, build prompts, and inspect generated proteins -with the `ESMProtein` and config classes used to interact with the local model. - -In any example script try to replace a local `ESM3` model with a Forge API client: - -```py -# Instead of loading the model locally on your machine: -model: ESM3InferenceClient = ESM3.from_pretrained("esm3_sm_open_v1").to("cuda") # or "cpu" -# just replace the line with this: -model: ESM3InferenceClient = esm.sdk.client("esm3-medium-2024-08", token="") -# and now you're interfacing with the model running on our remote servers. -... -``` - -and the exact same code will work. -This enables a seamless transition from smaller and faster models, to our large 98B protein language models for protein design work. - -### ESM3 Example Usage - -Let's explore some more advanced prompting with the help of our [notebooks and scripts](examples/). 
- -`generate.ipynb` will walk through two prompting examples (scaffolding and secondary structure editing) using the open model: -[](https://colab.research.google.com/github/evolutionaryscale/esm/blob/main/examples/generate.ipynb) - -`gfp_design.ipynb` will walk through the more complex generation procedure we used to design esmGFP: -[](https://colab.research.google.com/github/evolutionaryscale/esm/blob/main/examples/gfp_design.ipynb) - -We also provide example scripts that show common workflows under `examples/`: - -- [local_generate.py](./examples/local_generate.py) shows how simple and elegant common tasks are: it shows folding, inverse folding and chain of thought generation, all by calling just `model.generate()` for iterative decoding. -- [seqfun_struct.py](./examples/seqfun_struct.py) shows direct use of the model as a standard pytorch model with a simple model `forward` call. +Check out our [tutorials](./cookbook/tutorials) to learn how to use ESM C. ## Responsible Development @@ -238,27 +216,44 @@ The core tenets of our framework are With this in mind, we have performed a variety of mitigations for `esm3-sm-open-v1`, detailed in our [paper](https://www.evolutionaryscale.ai/papers/esm3-simulating-500-million-years-of-evolution-with-a-language-model) ## Licenses -The code and model weights of ESM3 and ESM C are available under a mixture of non-commercial and permissive commercial licenses. -This summary provides a high-level overview. For complete license details, see [LICENSE.md](./LICENSE.md). - -### How can I access the models and which licenses apply? - -The models can be accessed in three different ways, each with its own licensing terms. -1. 
**Code and weights** via GitHub and HuggingFace are available under either a [non-commercial](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement) (ESM C 600M, ESM3-small-open) or an [open license](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement) (codebase, ESM C 300M). - 1. **Building with ESM encouraged**: You can use embeddings, model predictions, fine-tune the models and use components of both the models and code. We strongly encourage anyone to build on ESM C and ESM3! Just remember to maintain the same license terms and release under the ESM name. -2. **Free non-commercial inference API** via Forge. All models are available this way, with free credits granted to students and researchers. We want to enable academics under [non-commercial Terms of Use](https://www.evolutionaryscale.ai/policies/terms-of-use), which mirrors the non-commercial license. -3. **Paid commercial Inference API** for commercial use via SageMaker (Forge coming soon). All ESM C models are available this way to commercial entities for commercial use under a [clickthrough license agreement](https://www.evolutionaryscale.ai/policies/cambrian-inference-clickthrough-license-agreement) with few restrictions. - 1. In broad strokes: standard commercial use like developing molecules and developing downstream ML models and methods with the model is allowed, while training competing models on the API outputs is not. - 2. Note: For ESM3 commercial use, reach out to [bd@evolutionaryscale.ai](mailto:bd@evolutionaryscale.ai) +The code and model weights of ESM3 and ESM C are available under a mixture of non-commercial and permissive commercial licenses. For complete license details, see [LICENSE.md](./LICENSE.md). -### What changed with the release of ESM C? 
+## Citations +If you use ESM in your work, please cite one of the following: -We introduced a [clickthrough license agreement](https://www.evolutionaryscale.ai/policies/cambrian-inference-clickthrough-license-agreement) to enable frictionless commercial use of ESM C. - -We introduced the new [Cambrian Open License](https://www.evolutionaryscale.ai/policies/cambrian-open-license-agreement) for ESM C 300M, and at the same time moved all code in the [`esm` repo](https://github.com/evolutionaryscale/esm) under that permissive license. +#### ESM3 +``` +@article {hayes2024simulating, + author = {Hayes, Thomas and Rao, Roshan and Akin, Halil and Sofroniew, Nicholas J. and Oktay, Deniz and Lin, Zeming and Verkuil, Robert and Tran, Vincent Q. and Deaton, Jonathan and Wiggert, Marius and Badkundri, Rohil and Shafkat, Irhum and Gong, Jun and Derry, Alexander and Molina, Raul S. and Thomas, Neil and Khan, Yousuf A. and Mishra, Chetan and Kim, Carolyn and Bartie, Liam J. and Nemeth, Matthew and Hsu, Patrick D. and Sercu, Tom and Candido, Salvatore and Rives, Alexander}, + title = {Simulating 500 million years of evolution with a language model}, + year = {2024}, + doi = {10.1101/2024.07.01.600583}, + URL = {https://doi.org/10.1101/2024.07.01.600583}, + journal = {bioRxiv} +} +``` -The [Cambrian non-commercial license](https://www.evolutionaryscale.ai/policies/cambrian-non-commercial-license-agreement) is largely based on the original [ESM3 Community License Agreement](https://www.evolutionaryscale.ai/policies/community-license-agreement), but removed the clause that restricted drug development, added the naming requirement, and extended the meaning of “Derivative Work” to allow training on model outputs. Just remember to release models and methods built on ESM under the same license. -These changes are meant to remove potential gray areas and points of friction for researchers building with ESM. 
+#### ESM C +``` +@misc{esm2024cambrian, + author = {{ESM Team}}, + title = {ESM Cambrian: Revealing the mysteries of proteins with unsupervised learning}, + year = {2024}, + publisher = {EvolutionaryScale Website}, + url = {https://evolutionaryscale.ai/blog/esm-cambrian}, + urldate = {2024-12-04} +} +``` -Finally, The ESM3-open-small model has been moved under the Cambrian non-commercial license. +#### ESM Github (Code / Weights) +``` +@software{evolutionaryscale_2024, + author = {{EvolutionaryScale Team}}, + title = {evolutionaryscale/esm}, + year = {2024}, + publisher = {Zenodo}, + doi = {10.5281/zenodo.14219303}, + URL = {https://doi.org/10.5281/zenodo.14219303} +} +``` diff --git a/cookbook/local/README.md b/cookbook/local/README.md new file mode 100644 index 0000000..3292a01 --- /dev/null +++ b/cookbook/local/README.md @@ -0,0 +1 @@ +Examples utilizing the open model run locally. diff --git a/examples/generate.ipynb b/cookbook/local/open_generate.ipynb similarity index 100% rename from examples/generate.ipynb rename to cookbook/local/open_generate.ipynb diff --git a/examples/raw_forwards.py b/cookbook/local/raw_forwards.py similarity index 100% rename from examples/raw_forwards.py rename to cookbook/local/raw_forwards.py diff --git a/cookbook/snippets/README.md b/cookbook/snippets/README.md new file mode 100644 index 0000000..34e7a33 --- /dev/null +++ b/cookbook/snippets/README.md @@ -0,0 +1 @@ +Snippets of ESM3 usage that you can copy and paste directly into your scripts. 
diff --git a/examples/local_generate.py b/cookbook/snippets/esm3.py similarity index 93% rename from examples/local_generate.py rename to cookbook/snippets/esm3.py index 3b718b3..1bc3629 100644 --- a/examples/local_generate.py +++ b/cookbook/snippets/esm3.py @@ -1,11 +1,15 @@ +import os + import torch from esm.models.esm3 import ESM3 +from esm.sdk import client from esm.sdk.api import ( ESM3InferenceClient, ESMProtein, ESMProteinError, ESMProteinTensor, + ForwardAndSampleOutput, GenerationConfig, LogitsConfig, LogitsOutput, @@ -195,4 +199,12 @@ def main(client: ESM3InferenceClient): if __name__ == "__main__": - main(ESM3.from_pretrained("esm3_sm_open_v1")) + if os.environ.get("ESM_API_KEY", ""): + print("ESM_API_KEY found. Trying to use model from Forge...") + main(client()) + else: + print("No ESM_API_KEY found. Trying to load model locally...") + print( + "TO try this script with a Forge API, please run ESM_API_KEY=your_api_key python esm3.py" + ) + main(ESM3.from_pretrained("esm3_sm_open_v1")) diff --git a/examples/esmc_examples.py b/cookbook/snippets/esmc.py similarity index 87% rename from examples/esmc_examples.py rename to cookbook/snippets/esmc.py index 3bfa788..ce189d9 100644 --- a/examples/esmc_examples.py +++ b/cookbook/snippets/esmc.py @@ -29,6 +29,13 @@ def main(client: ESMCInferenceClient): f"Client returned logits with shape: {output.logits.sequence.shape}, embeddings with shape: {output.embeddings.shape}, and hidden states with shape {output.hidden_states.shape}" ) + # request a specific hidden layer. 
+ output = client.logits( + protein_tensor, LogitsConfig(return_hidden_states=True, ith_hidden_layer=1) + ) + assert output.hidden_states is not None + print(f"Client returned hidden states with shape {output.hidden_states.shape}") + def raw_forward(model: ESMC): protein = ESMProtein(sequence="AAAAA") diff --git a/examples/folding_inverse_folding_example.py b/cookbook/snippets/fold_invfold.py similarity index 62% rename from examples/folding_inverse_folding_example.py rename to cookbook/snippets/fold_invfold.py index 902c576..d4c1a1d 100644 --- a/examples/folding_inverse_folding_example.py +++ b/cookbook/snippets/fold_invfold.py @@ -1,15 +1,31 @@ +import os from typing import cast import numpy as np -from examples.local_generate import get_sample_protein from esm.sdk.api import ( ESM3InferenceClient, ESMProtein, GenerationConfig, InverseFoldingConfig, ) -from esm.sdk.forge import SequenceStructureForgeInferenceClient +from esm.sdk.forge import ( + ESM3ForgeInferenceClient, + SequenceStructureForgeInferenceClient, +) +from esm.utils.structure.protein_chain import ProteinChain +from esm.utils.types import FunctionAnnotation + + +def get_sample_protein() -> ESMProtein: + protein = ProteinChain.from_rcsb("1utn") + protein = ESMProtein.from_protein_chain(protein) + protein.function_annotations = [ + # Peptidase S1A, chymotrypsin family: https://www.ebi.ac.uk/interpro/structure/PDB/1utn/ + FunctionAnnotation(label="peptidase", start=100, end=114), + FunctionAnnotation(label="chymotrypsin", start=190, end=202), + ] + return protein def convert_none_to_nan(data): @@ -22,22 +38,23 @@ def convert_none_to_nan(data): return data -def main( +def fold( sequence_structure_client: SequenceStructureForgeInferenceClient, esm3_client: ESM3InferenceClient, ): - # Folding with esm3 client protein = get_sample_protein() protein.coordinates = None protein.function_annotations = None protein.sasa = None assert protein.sequence is not None, "Protein sequence must be set to fold" + # 
Folding with esm3 client config = GenerationConfig(track="structure", num_steps=1, temperature=0) esm3_client_folded_protein = esm3_client.generate(protein, config) assert isinstance( esm3_client_folded_protein, ESMProtein ), f"Using ESM3 client, ESMProtein was expected but got {protein}" + # Folding with folding client sequence_structure_client_folded_protein = sequence_structure_client.fold( protein.sequence, potential_sequence_of_concern=False @@ -45,8 +62,14 @@ def main( assert isinstance( sequence_structure_client_folded_protein, ESMProtein ), f"Using sequence_structure client, ESMProtein was expected but got {sequence_structure_client_folded_protein}" + sequence_structure_client_folded_protein.to_pdb("folded_protein.pdb") + print("Saving folded protein to folded_protein.pdb") - # Inverse Folding with esm3 client + +def inverse_fold( + sequence_structure_client: SequenceStructureForgeInferenceClient, + esm3_client: ESM3InferenceClient, +): protein = get_sample_protein() protein.sequence = None protein.sasa = None @@ -54,21 +77,38 @@ def main( assert ( protein.coordinates is not None ), "Protein coordinates must be set to inverse fold" - config = GenerationConfig("sequence", num_steps=1, temperature=0.7) + + # Inverse Folding with esm3 client + config = GenerationConfig("sequence", num_steps=1, temperature=0.1) esm3_client_inv_folded_protein = cast( ESMProtein, esm3_client.generate(protein, config) ) assert isinstance( esm3_client_inv_folded_protein, ESMProtein ), f"Using ESM3 client, ESMProtein was expected but got {protein}" + # Inverse Folding with inverse folding client sequence_structure_client_inv_folded_protein = ( sequence_structure_client.inverse_fold( protein.coordinates, - config=InverseFoldingConfig(temperature=0.7), + config=InverseFoldingConfig(temperature=0.1), potential_sequence_of_concern=False, ) ) assert isinstance( sequence_structure_client_inv_folded_protein, ESMProtein ), f"Using sequence_structure client, ESMProtein was expected but got 
{sequence_structure_client_inv_folded_protein}" + print( + f"Inverse folded protein: {sequence_structure_client_inv_folded_protein.sequence}" + ) + + +if __name__ == "__main__": + if not os.environ.get("ESM_API_KEY", ""): + print("Please export your Forge API key as ESM_API_KEY environment variable.") + client = SequenceStructureForgeInferenceClient(token=os.environ["ESM_API_KEY"]) + esm3_client = ESM3ForgeInferenceClient( + model="esm3-medium-2024-08", token=os.environ["ESM_API_KEY"] + ) + fold(client, esm3_client) + inverse_fold(client, esm3_client) diff --git a/examples/esmprotein.ipynb b/cookbook/tutorials/1_esmprotein.ipynb similarity index 82% rename from examples/esmprotein.ipynb rename to cookbook/tutorials/1_esmprotein.ipynb index 38be2f2..fbe6d70 100644 --- a/examples/esmprotein.ipynb +++ b/cookbook/tutorials/1_esmprotein.ipynb @@ -44,8 +44,11 @@ "metadata": {}, "outputs": [], "source": [ - "# Install esm\n", - "! pip install esm" + "# Install esm and other dependencies\n", + "! pip install esm\n", + "! pip install py3Dmol\n", + "! pip install matplotlib\n", + "! pip install dna-features-viewer" ] }, { @@ -55,8 +58,8 @@ "outputs": [], "source": [ "from biotite.database import rcsb\n", - "from esm.utils.structure.protein_chain import ProteinChain\n", "from esm.sdk.api import ESMProtein\n", + "from esm.utils.structure.protein_chain import ProteinChain\n", "from esm.utils.types import FunctionAnnotation\n", "\n", "pdb_id = \"1cm4\"\n", @@ -140,23 +143,21 @@ "metadata": {}, "outputs": [], "source": [ - "\"\"\"\n", - "Functions for visualizing 3D structure\n", - "\"\"\"\n", - "\n", - "! 
pip install py3Dmol\n", + "# Functions for visualizing 3D structure\n", "\n", "import py3Dmol\n", "\n", + "\n", "def visualize_pdb(pdb_string):\n", " view = py3Dmol.view(width=400, height=400)\n", " view.addModel(pdb_string, \"pdb\")\n", - " view.setStyle({'cartoon': {'color': 'spectrum'}})\n", + " view.setStyle({\"cartoon\": {\"color\": \"spectrum\"}})\n", " view.zoomTo()\n", " view.render()\n", " view.center()\n", " return view\n", "\n", + "\n", "def visualize_3D_coordinates(coordinates):\n", " \"\"\"\n", " This uses all Alanines\n", @@ -166,6 +167,7 @@ " pdb_string = protein_with_same_coords.to_pdb_string()\n", " return visualize_pdb(pdb_string)\n", "\n", + "\n", "def visualize_3D_protein(protein):\n", " pdb_string = protein.to_pdb_string()\n", " return visualize_pdb(pdb_string)" @@ -210,17 +212,20 @@ "source": [ "from biotite.structure import annotate_sse\n", "\n", + "\n", "def get_approximate_ss(protein_chain: ProteinChain):\n", " # get biotite's ss3 representation\n", " ss3_arr = annotate_sse(protein_chain.atom_array)\n", - " biotite_ss3_str = ''.join(ss3_arr)\n", + " biotite_ss3_str = \"\".join(ss3_arr)\n", "\n", " # translate into ESM3's representation\n", - " translation_table = str.maketrans({\n", - " 'a': 'H', # alpha helix\n", - " 'b': 'E', # beta sheet\n", - " 'c': 'C', # coil\n", - " })\n", + " translation_table = str.maketrans(\n", + " {\n", + " \"a\": \"H\", # alpha helix\n", + " \"b\": \"E\", # beta sheet\n", + " \"c\": \"C\", # coil\n", + " }\n", + " )\n", " esm_ss3 = biotite_ss3_str.translate(translation_table)\n", " return esm_ss3" ] @@ -248,24 +253,22 @@ "metadata": {}, "outputs": [], "source": [ - "# ! 
pip install matplotlib\n", - "\n", "# Slightly modified version of secondary structure plotting code from\n", "# https://www.biotite-python.org/examples/gallery/structure/transketolase_sse.html\n", "# Code source: Patrick Kunzmann\n", "# License: BSD 3 clause\n", "\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib.patches import Rectangle\n", "import biotite\n", "import biotite.sequence as seq\n", "import biotite.sequence.graphics as graphics\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from matplotlib.patches import Rectangle\n", + "\n", "\n", "# Create 'FeaturePlotter' subclasses\n", "# for drawing the secondary structure features\n", "class HelixPlotter(graphics.FeaturePlotter):\n", - "\n", " def __init__(self):\n", " pass\n", "\n", @@ -281,12 +284,12 @@ " def draw(self, axes, feature, bbox, loc, style_param):\n", " # Approx. 1 turn per 3.6 residues to resemble natural helix\n", " n_turns = np.ceil((loc.last - loc.first + 1) / 3.6)\n", - " x_val = np.linspace(0, n_turns * 2*np.pi, 100)\n", + " x_val = np.linspace(0, n_turns * 2 * np.pi, 100)\n", " # Curve ranges from 0.3 to 0.7\n", - " y_val = (-0.4*np.sin(x_val) + 1) / 2\n", + " y_val = (-0.4 * np.sin(x_val) + 1) / 2\n", "\n", " # Transform values for correct location in feature map\n", - " x_val *= bbox.width / (n_turns * 2*np.pi)\n", + " x_val *= bbox.width / (n_turns * 2 * np.pi)\n", " x_val += bbox.x0\n", " y_val *= bbox.height\n", " y_val += bbox.y0\n", @@ -296,13 +299,10 @@ " bbox.p0, bbox.width, bbox.height, color=\"white\", linewidth=0\n", " )\n", " axes.add_patch(background)\n", - " axes.plot(\n", - " x_val, y_val, linewidth=2, color=biotite.colors[\"dimgreen\"]\n", - " )\n", + " axes.plot(x_val, y_val, linewidth=2, color=biotite.colors[\"dimgreen\"])\n", "\n", "\n", "class SheetPlotter(graphics.FeaturePlotter):\n", - "\n", " def __init__(self, head_width=0.8, tail_width=0.5):\n", " self._head_width = head_width\n", " self._tail_width = 
tail_width\n", @@ -316,36 +316,48 @@ "\n", " def draw(self, axes, feature, bbox, loc, style_param):\n", " x = bbox.x0\n", - " y = bbox.y0 + bbox.height/2\n", + " y = bbox.y0 + bbox.height / 2\n", " dx = bbox.width\n", " dy = 0\n", "\n", - " if loc.defect & seq.Location.Defect.MISS_RIGHT:\n", + " if loc.defect & seq.Location.Defect.MISS_RIGHT:\n", " # If the feature extends into the previous or next line\n", " # do not draw an arrow head\n", " draw_head = False\n", " else:\n", " draw_head = True\n", "\n", - " axes.add_patch(biotite.AdaptiveFancyArrow(\n", - " x, y, dx, dy,\n", - " self._tail_width*bbox.height, self._head_width*bbox.height,\n", - " # Create head with 90 degrees tip\n", - " # -> head width/length ratio = 1/2\n", - " head_ratio=0.5, draw_head=draw_head,\n", - " color=biotite.colors[\"orange\"], linewidth=0\n", - " ))\n", + " axes.add_patch(\n", + " biotite.AdaptiveFancyArrow(\n", + " x,\n", + " y,\n", + " dx,\n", + " dy,\n", + " self._tail_width * bbox.height,\n", + " self._head_width * bbox.height,\n", + " # Create head with 90 degrees tip\n", + " # -> head width/length ratio = 1/2\n", + " head_ratio=0.5,\n", + " draw_head=draw_head,\n", + " color=biotite.colors[\"orange\"],\n", + " linewidth=0,\n", + " )\n", + " )\n", + "\n", "\n", "# Converter for the DSSP secondary structure elements\n", "# to the classical ones\n", - "dssp_to_abc = {\"I\" : \"c\",\n", - " \"S\" : \"c\",\n", - " \"H\" : \"a\",\n", - " \"E\" : \"b\",\n", - " \"G\" : \"c\",\n", - " \"B\" : \"b\",\n", - " \"T\" : \"c\",\n", - " \"C\" : \"c\"}\n", + "dssp_to_abc = {\n", + " \"I\": \"c\",\n", + " \"S\": \"c\",\n", + " \"H\": \"a\",\n", + " \"E\": \"b\",\n", + " \"G\": \"c\",\n", + " \"B\": \"b\",\n", + " \"T\": \"c\",\n", + " \"C\": \"c\",\n", + "}\n", + "\n", "\n", "def visualize_secondary_structure(sse, first_id):\n", " \"\"\"\n", @@ -362,7 +374,7 @@ " # coil\n", " return\n", " feature = seq.Feature(\n", - " \"SecStr\", [seq.Location(first, last)], {\"sec_str_type\" : str_type}\n", + 
" \"SecStr\", [seq.Location(first, last)], {\"sec_str_type\": str_type}\n", " )\n", " annotation.add_feature(feature)\n", "\n", @@ -376,21 +388,23 @@ " curr_start = i\n", " curr_sse = sse[i]\n", " else:\n", - " if sse[i] != sse[i-1]:\n", + " if sse[i] != sse[i - 1]:\n", " _add_sec_str(\n", - " annotation, curr_start+first_id, i-1+first_id, curr_sse\n", + " annotation, curr_start + first_id, i - 1 + first_id, curr_sse\n", " )\n", " curr_start = i\n", " curr_sse = sse[i]\n", " # Add last secondary structure element to annotation\n", - " _add_sec_str(annotation, curr_start+first_id, i+first_id, curr_sse)\n", + " _add_sec_str(annotation, curr_start + first_id, i + first_id, curr_sse)\n", "\n", " fig = plt.figure(figsize=(30.0, 3.0))\n", " ax = fig.add_subplot(111)\n", " graphics.plot_feature_map(\n", - " ax, annotation, symbols_per_line=150,\n", - " loc_range=(first_id, first_id+len(sse)),\n", - " feature_plotters=[HelixPlotter(), SheetPlotter()]\n", + " ax,\n", + " annotation,\n", + " symbols_per_line=150,\n", + " loc_range=(first_id, first_id + len(sse)),\n", + " feature_plotters=[HelixPlotter(), SheetPlotter()],\n", " )\n", " fig.tight_layout()\n", " return fig, ax\n", @@ -440,10 +454,8 @@ "metadata": {}, "outputs": [], "source": [ - "from esm.utils.types import FunctionAnnotation\n", - "\n", "interpro_function_annotations = [\n", - " FunctionAnnotation(label=\"IPR050145\", start=1, end=142), # 1 indexed, inclusive;\n", + " FunctionAnnotation(label=\"IPR050145\", start=1, end=142), # 1 indexed, inclusive;\n", " FunctionAnnotation(label=\"IPR002048\", start=4, end=75),\n", " FunctionAnnotation(label=\"IPR002048\", start=77, end=144),\n", " FunctionAnnotation(label=\"IPR011992\", start=1, end=143),\n", @@ -467,21 +479,18 @@ "metadata": {}, "outputs": [], "source": [ - "\"\"\"\n", - "Functions for visualizing InterPro function annotations\n", - "\"\"\"\n", - "\n", - "! 
pip install dna-features-viewer\n", + "# Functions for visualizing InterPro function annotations\n", "\n", "from dna_features_viewer import GraphicFeature, GraphicRecord\n", - "from esm.utils.function.interpro import InterProEntryType, InterPro\n", + "from esm.utils.function.interpro import InterPro, InterProEntryType\n", "from matplotlib import colormaps\n", "\n", + "\n", "def visualize_function_annotations(\n", " annotations: list[FunctionAnnotation],\n", " sequence_length: int,\n", " ax: plt.Axes,\n", - " interpro_ = InterPro(),\n", + " interpro_=InterPro(),\n", "):\n", " cmap = colormaps[\"tab10\"]\n", " colors = [cmap(i) for i in range(len(InterProEntryType))]\n", @@ -498,7 +507,7 @@ " entry_type = InterProEntryType.UNKNOWN\n", "\n", " feature = GraphicFeature(\n", - " start=annotation.start - 1, # one index -> zero index\n", + " start=annotation.start - 1, # one index -> zero index\n", " end=annotation.end,\n", " label=label,\n", " color=type_colors[entry_type],\n", @@ -507,9 +516,7 @@ " features.append(feature)\n", "\n", " record = GraphicRecord(\n", - " sequence=None,\n", - " sequence_length=sequence_length,\n", - " features=features,\n", + " sequence=None, sequence_length=sequence_length, features=features\n", " )\n", "\n", " record.plot(figure_width=12, plot_sequence=False, ax=ax)" @@ -529,11 +536,7 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(20.0, 4.0))\n", - "visualize_function_annotations(\n", - " interpro_function_annotations,\n", - " len(protein),\n", - " ax,\n", - ")" + "visualize_function_annotations(interpro_function_annotations, len(protein), ax)" ] }, { @@ -553,6 +556,7 @@ "source": [ "from esm.tokenization import InterProQuantizedTokenizer\n", "\n", + "\n", "def get_keywords_from_interpro(\n", " interpro_annotations,\n", " interpro2keywords=InterProQuantizedTokenizer().interpro2keywords,\n", @@ -560,14 +564,16 @@ " keyword_annotations_list = []\n", " for interpro_annotation in interpro_annotations:\n", " keywords = 
interpro2keywords.get(interpro_annotation.label, [])\n", - " keyword_annotations_list.extend([\n", - " FunctionAnnotation(\n", - " label=keyword,\n", - " start=interpro_annotation.start,\n", - " end=interpro_annotation.end,\n", - " )\n", - " for keyword in keywords\n", - " ])\n", + " keyword_annotations_list.extend(\n", + " [\n", + " FunctionAnnotation(\n", + " label=keyword,\n", + " start=interpro_annotation.start,\n", + " end=interpro_annotation.end,\n", + " )\n", + " for keyword in keywords\n", + " ]\n", + " )\n", " return keyword_annotations_list" ] }, @@ -595,11 +601,7 @@ "outputs": [], "source": [ "fig, ax = plt.subplots(figsize=(20.0, 8.0))\n", - "visualize_function_annotations(\n", - " protein.function_annotations,\n", - " len(protein),\n", - " ax,\n", - ")" + "visualize_function_annotations(protein.function_annotations, len(protein), ax)" ] }, { @@ -656,30 +658,50 @@ "metadata": {}, "outputs": [], "source": [ - "from matplotlib import colormaps\n", - "\n", - "cmap = colormaps['cividis']\n", + "cmap = colormaps[\"cividis\"]\n", "clip_sasa_lower = 10\n", "clip_sasa_upper = 90\n", "\n", - "def plot_heatmap_legend(\n", - " cmap,\n", - " clip_sasa_lower,\n", - " clip_sasa_upper,\n", - "):\n", + "\n", + "def plot_heatmap_legend(cmap, clip_sasa_lower, clip_sasa_upper):\n", " gradient = np.linspace(0, 1, 256)\n", " gradient = np.vstack((gradient, gradient))\n", " _, ax = plt.subplots(figsize=(5, 0.3), dpi=350)\n", - " ax.imshow(gradient, aspect='auto', cmap=cmap)\n", - " ax.text(0.1, -0.3, f'{clip_sasa_lower} or lower', va='center', ha='right', fontsize=7, transform=ax.transAxes)\n", - " ax.text(0.5, -0.3, f'{(clip_sasa_lower + clip_sasa_upper) // 2}', va='center', ha='right', fontsize=7, transform=ax.transAxes)\n", - " ax.text(0.9, -0.3, f'{clip_sasa_upper} or higher', va='center', ha='left', fontsize=7, transform=ax.transAxes)\n", + " ax.imshow(gradient, aspect=\"auto\", cmap=cmap)\n", + " ax.text(\n", + " 0.1,\n", + " -0.3,\n", + " f\"{clip_sasa_lower} or 
lower\",\n", + " va=\"center\",\n", + " ha=\"right\",\n", + " fontsize=7,\n", + " transform=ax.transAxes,\n", + " )\n", + " ax.text(\n", + " 0.5,\n", + " -0.3,\n", + " f\"{(clip_sasa_lower + clip_sasa_upper) // 2}\",\n", + " va=\"center\",\n", + " ha=\"right\",\n", + " fontsize=7,\n", + " transform=ax.transAxes,\n", + " )\n", + " ax.text(\n", + " 0.9,\n", + " -0.3,\n", + " f\"{clip_sasa_upper} or higher\",\n", + " va=\"center\",\n", + " ha=\"left\",\n", + " fontsize=7,\n", + " transform=ax.transAxes,\n", + " )\n", " ax.set_xticklabels([])\n", " ax.set_yticklabels([])\n", " ax.set_xticks([])\n", " ax.set_yticks([])\n", " plt.show()\n", "\n", + "\n", "plot_heatmap_legend(cmap, clip_sasa_lower, clip_sasa_upper)" ] }, @@ -689,38 +711,33 @@ "metadata": {}, "outputs": [], "source": [ - "\"\"\"\n", - "Functions for visualizing SASA as colors on the 3D structure\n", - "\"\"\"\n", + "# Functions for visualizing SASA as colors on the 3D structure\n", "\n", - "def get_color_strings(\n", - " sasa,\n", - " clip_sasa_lower,\n", - " clip_sasa_upper,\n", - " cmap,\n", - "):\n", + "\n", + "def get_color_strings(sasa, clip_sasa_lower, clip_sasa_upper, cmap):\n", " transformed_sasa = np.clip(sasa, clip_sasa_lower, clip_sasa_upper)\n", - " transformed_sasa = (transformed_sasa - clip_sasa_lower) / (clip_sasa_upper - clip_sasa_lower)\n", + " transformed_sasa = (transformed_sasa - clip_sasa_lower) / (\n", + " clip_sasa_upper - clip_sasa_lower\n", + " )\n", " rgbas = (cmap(transformed_sasa) * 255).astype(int)\n", "\n", - " return [\n", - " f'rgb({rgba[0]},{rgba[1]},{rgba[2]})'\n", - " for rgba in rgbas\n", - " ] \n", + " return [f\"rgb({rgba[0]},{rgba[1]},{rgba[2]})\" for rgba in rgbas]\n", + "\n", "\n", "def visualize_sasa_3D_protein(\n", - " protein,\n", - " clip_sasa_lower=clip_sasa_lower,\n", - " clip_sasa_upper=clip_sasa_upper,\n", - " cmap=cmap,\n", + " protein, clip_sasa_lower=clip_sasa_lower, clip_sasa_upper=clip_sasa_upper, cmap=cmap\n", "):\n", " pdb_string = 
protein.to_pdb_string()\n", " plot_heatmap_legend(cmap, clip_sasa_lower, clip_sasa_upper)\n", " view = py3Dmol.view(width=400, height=400)\n", " view.addModel(pdb_string, \"pdb\")\n", "\n", - " for res_pos, res_color in enumerate(get_color_strings(protein.sasa, clip_sasa_lower, clip_sasa_upper, cmap)):\n", - " view.setStyle({'chain': 'A', 'resi': res_pos+1}, {'cartoon': {'color': res_color}})\n", + " for res_pos, res_color in enumerate(\n", + " get_color_strings(protein.sasa, clip_sasa_lower, clip_sasa_upper, cmap)\n", + " ):\n", + " view.setStyle(\n", + " {\"chain\": \"A\", \"resi\": res_pos + 1}, {\"cartoon\": {\"color\": res_color}}\n", + " )\n", " view.zoomTo()\n", " view.render()\n", " view.center()\n", diff --git a/cookbook/tutorials/2_embed.ipynb b/cookbook/tutorials/2_embed.ipynb new file mode 100644 index 0000000..284b8cd --- /dev/null +++ b/cookbook/tutorials/2_embed.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook we will see how to embed a batch of sequences using ESM C, as well as explore its different layers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set up Forge client for ESM C" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "\n", + "token = getpass(\"Token from Forge console: \")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from esm.sdk import client\n", + "\n", + "model = client(\n", + " model=\"esmc-300m-2024-12\", url=\"https://forge.evolutionaryscale.ai\", token=token\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Set up utilities for embedding sequences\n", + "\n", + "Since we're embedding more than a few sequences, we're going to make threaded, concurrent calls to Forge and let Forge take care of batching and parallelization on the 
backend." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from concurrent.futures import ThreadPoolExecutor\n", + "from typing import Sequence\n", + "\n", + "from esm.sdk.api import (\n", + " ESM3InferenceClient,\n", + " ESMProtein,\n", + " ESMProteinError,\n", + " LogitsConfig,\n", + " LogitsOutput,\n", + " ProteinType,\n", + ")\n", + "\n", + "EMBEDDING_CONFIG = LogitsConfig(\n", + " sequence=True, return_embeddings=True, return_hidden_states=True\n", + ")\n", + "\n", + "\n", + "def embed_sequence(model: ESM3InferenceClient, sequence: str) -> LogitsOutput:\n", + " protein = ESMProtein(sequence=sequence)\n", + " protein_tensor = model.encode(protein)\n", + " output = model.logits(protein_tensor, EMBEDDING_CONFIG)\n", + " return output\n", + "\n", + "\n", + "def batch_embed(\n", + " model: ESM3InferenceClient, inputs: Sequence[ProteinType]\n", + ") -> Sequence[LogitsOutput]:\n", + " \"\"\"Forge supports auto-batching. So batch_embed() is as simple as running a collection\n", + " of embed calls in parallel using a thread pool.\n", + " \"\"\"\n", + " with ThreadPoolExecutor() as executor:\n", + " futures = [\n", + " executor.submit(embed_sequence, model, protein) for protein in inputs\n", + " ]\n", + " results = []\n", + " for future in futures:\n", + " try:\n", + " results.append(future.result())\n", + " except Exception as e:\n", + " results.append(ESMProteinError(500, str(e)))\n", + " return results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Requesting a specific hidden layer\n", + "\n", + "ESM C 6B's hidden states are really large, so we only allow one specific layer to be requested per API call. This also works for other ESM C models, but it is required for ESM C 6B. \n", + "Refer to https://forge.evolutionaryscale.ai/console to find the number of hidden layers for each model. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ESMC_6B_EMBEDDING_CONFIG = LogitsConfig(return_hidden_states=True, ith_hidden_layer=55)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset\n", + "\n", + "This dataset is taken from Muir, et al. 2024 [\"Evolutionary-Scale Enzymology Enables Biochemical Constant Prediction Across a Multi-Peaked Catalytic Landscape\"](https://doi.org/10.1101/2024.10.23.619915) which explores a model enzyme called Adenylate Kinase (ADK). Adenylate Kinase appears in many different organisms with different structural classes (referred to as its \"lid type\"). We'll embed this set of ADK sequences and see if we can recover known structural classes." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2025-01-15 08:09:22-- https://docs.google.com/uc?export=download&id=1SpOkL11MJxIgy99dqufvUNJuCiuhxuyg\n", + "Resolving docs.google.com (docs.google.com)... 142.251.32.46, 2607:f8b0:4005:811::200e\n", + "Connecting to docs.google.com (docs.google.com)|142.251.32.46|:443... connected.\n", + "HTTP request sent, awaiting response... 303 See Other\n", + "Location: https://drive.usercontent.google.com/download?id=1SpOkL11MJxIgy99dqufvUNJuCiuhxuyg&export=download [following]\n", + "--2025-01-15 08:09:22-- https://drive.usercontent.google.com/download?id=1SpOkL11MJxIgy99dqufvUNJuCiuhxuyg&export=download\n", + "Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.188.1, 2607:f8b0:4005:802::2001\n", + "Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.188.1|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 43132 (42K) [application/octet-stream]\n", + "Saving to: ‘adk.csv’\n", + "\n", + "adk.csv 100%[===================>] 42.12K --.-KB/s in 0.03s \n", + "\n", + "2025-01-15 08:09:24 (1.43 MB/s) - ‘adk.csv’ saved [43132/43132]\n", + "\n" + ] + } + ], + "source": [ + "!wget --no-check-certificate \"https://docs.google.com/uc?export=download&id=1SpOkL11MJxIgy99dqufvUNJuCiuhxuyg\" -O adk.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "\n", + "adk_path = \"adk.csv\"\n", + "df = pd.read_csv(adk_path)\n", + "df = df[[\"org_name\", \"sequence\", \"lid_type\", \"temperature\"]]\n", + "df = df[df[\"lid_type\"] != \"other\"] # drop one structural class for simplicity" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Retrying... Attempt 1 after 1.0s due to: (502, 'Failure in logits: {\"status\":\"error\",\"message\":\"Model unavailable - please retry\"}')\n", + "Retrying... Attempt 1 after 1.0s due to: (502, 'Failure in logits: {\"status\":\"error\",\"message\":\"Model unavailable - please retry\"}')\n", + "Retrying... Attempt 1 after 1.0s due to: (502, 'Failure in encode: {\"status\":\"error\",\"message\":\"Model unavailable - please retry\"}')\n", + "Retrying... Attempt 1 after 1.0s due to: (502, 'Failure in logits: {\"status\":\"error\",\"message\":\"Model unavailable - please retry\"}')\n", + "Retrying... Attempt 1 after 1.0s due to: (502, 'Failure in logits: {\"status\":\"error\",\"message\":\"Model unavailable - please retry\"}')\n", + "Retrying... 
Attempt 1 after 1.0s due to: (502, 'Failure in logits: {\"status\":\"error\",\"message\":\"Model unavailable - please retry\"}')\n" + ] + } + ], + "source": [ + "outputs = batch_embed(model, df[\"sequence\"].tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([31, 960])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "# we'll summarize the embeddings using their mean across the sequence which allows us to compare embeddings\n", + "# for sequences of different lengths\n", + "all_mean_embeddings = [\n", + " torch.mean(output.hidden_states, dim=-2).squeeze() for output in outputs\n", + "]\n", + "\n", + "# now we have a list of tensors of [num_layers, hidden_size]\n", + "print(all_mean_embeddings[0].shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Examine the performance of different layer embeddings\n", + "\n", + "For this example, we're going to use PCA to visualize whether the embeddings separate our proteins by their structural class. To assess the quality of our PCA, we run k-means clustering with three clusters, corresponding to the three structural classes of our enzyme, and compute the [adjusted Rand index](https://en.wikipedia.org/wiki/Rand_index), a measure of how well the clusters agree with the known structural classes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.metrics import adjusted_rand_score\n", + "\n", + "N_KMEANS_CLUSTERS = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_embeddings_at_layer(all_mean_embeddings: torch.Tensor, layer_idx: int):\n", + " stacked_mean_embeddings = torch.stack(\n", + " [embedding[layer_idx, :] for embedding in all_mean_embeddings]\n", + " ).numpy()\n", + "\n", + " # project all the embeddings to 2D using PCA\n", + " pca = PCA(n_components=2)\n", + " pca.fit(stacked_mean_embeddings)\n", + " projected_mean_embeddings = pca.transform(stacked_mean_embeddings)\n", + "\n", + " # compute kmeans purity as a measure of how good the clustering is\n", + " kmeans = KMeans(n_clusters=N_KMEANS_CLUSTERS, random_state=0).fit(\n", + " projected_mean_embeddings\n", + " )\n", + " rand_index = adjusted_rand_score(df[\"lid_type\"], kmeans.labels_)\n", + "\n", + " # plot the clusters\n", + " plt.figure(figsize=(4, 4))\n", + " sns.scatterplot(\n", + " x=projected_mean_embeddings[:, 0],\n", + " y=projected_mean_embeddings[:, 1],\n", + " hue=df[\"lid_type\"],\n", + " )\n", + " plt.title(\n", + " f\"PCA of mean embeddings at layer {layer_idx}.\\nRand index: {rand_index:.2f}\"\n", + " )\n", + " plt.xlabel(\"PC 1\")\n", + " plt.ylabel(\"PC 2\")\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plot_embeddings_at_layer(all_mean_embeddings, layer_idx=30)\n", + "plot_embeddings_at_layer(all_mean_embeddings, layer_idx=12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that the top principal components of layer 12 separate structural classes better than that of layer 30. Embed away! And keep in mind that different layers may be better or worse for your particular use-case." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "openesm", + "language": "python", + "name": "esmopen" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/gfp_design.ipynb b/cookbook/tutorials/3_gfp_design.ipynb similarity index 100% rename from examples/gfp_design.ipynb rename to cookbook/tutorials/3_gfp_design.ipynb diff --git a/examples/forge_generate.ipynb b/cookbook/tutorials/4_forge_generate.ipynb similarity index 99% rename from examples/forge_generate.ipynb rename to cookbook/tutorials/4_forge_generate.ipynb index e386dd5..2f622ec 100644 --- a/examples/forge_generate.ipynb +++ b/cookbook/tutorials/4_forge_generate.ipynb @@ -20,9 +20,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Imports\n", - "\n", - "If you're running in Colab, you probably want to get a GPU runtime first (Runtime > Change runtime type > T4 GPU).\n" + "# Imports\n" ] }, { diff --git a/cookbook/tutorials/README.md b/cookbook/tutorials/README.md new file mode 100644 index 0000000..f967c9d --- /dev/null +++ b/cookbook/tutorials/README.md @@ -0,0 +1,8 @@ +# ESM API Fundamentals + +A series of notebook tutorials that cover how to use the Forge API to use ESM3 to generate proteins and ESM C to 
embed proteins. + +* [Understanding the ESMProtein Class](./1_esmprotein.ipynb) +* [Embedding a sequence using ESM C](./2_embed.ipynb) +* [Generating a novel GFP with chain of thought generation using ESM3](./3_gfp_design.ipynb) +* [Advanced prompting with additional ESM3 tracks](./4_forge_generate.ipynb) diff --git a/esm/__init__.py b/esm/__init__.py index 98707ef..afbf4a0 100644 --- a/esm/__init__.py +++ b/esm/__init__.py @@ -1,2 +1,2 @@ -__version__ = "3.1.2" +__version__ = "3.1.3" diff --git a/esm/models/esmc.py b/esm/models/esmc.py index 0d3438f..0807a21 100644 --- a/esm/models/esmc.py +++ b/esm/models/esmc.py @@ -145,7 +145,7 @@ def forward( ), "sequence_id must be a boolean mask if Flash Attention is used" assert sequence_id.shape == (B, L) assert unpad_input is not None - x, indices, _, _, _ = unpad_input( # type: ignore + x, indices, *_ = unpad_input( # type: ignore x, sequence_id ) else: @@ -208,6 +208,12 @@ def logits( else contextlib.nullcontext(), ): output = self.forward(sequence_tokens=input.sequence) + assert output.hidden_states is not None + output.hidden_states = ( + output.hidden_states[config.ith_hidden_layer : config.ith_hidden_layer + 1] + if config.ith_hidden_layer != -1 + else output.hidden_states + ) return LogitsOutput( logits=ForwardTrackData( diff --git a/esm/models/function_decoder.py b/esm/models/function_decoder.py index b918798..c4f3299 100644 --- a/esm/models/function_decoder.py +++ b/esm/models/function_decoder.py @@ -268,9 +268,18 @@ def decode( keyword_logits[~where_decode, :] = -torch.inf if decode_keywords: keyword_preds = F.sigmoid(keyword_logits) >= keywords_threshold - outputs["function_keywords"] = self._preds_to_keywords( - keyword_preds.detach().cpu().numpy() + keywords = self._preds_to_keywords(keyword_preds.detach().cpu().numpy()) + keywords = merge_annotations( + keywords, merge_gap_max=annotation_gap_merge_max ) + if annotation_min_length is not None: + keywords = [ + annotation + for annotation in keywords + if 
annotation.end - annotation.start + 1 >= annotation_min_length + ] + + outputs["function_keywords"] = keywords return outputs diff --git a/esm/sdk/api.py b/esm/sdk/api.py index ebfe893..f77d32f 100644 --- a/esm/sdk/api.py +++ b/esm/sdk/api.py @@ -347,6 +347,7 @@ class LogitsConfig: # Embeddings. return_embeddings: bool = False return_hidden_states: bool = False + ith_hidden_layer: int = -1 @define diff --git a/esm/sdk/forge.py b/esm/sdk/forge.py index 13e8f45..c795591 100644 --- a/esm/sdk/forge.py +++ b/esm/sdk/forge.py @@ -1,5 +1,5 @@ -import asyncio import base64 +from concurrent.futures import ThreadPoolExecutor from functools import wraps from typing import Sequence from urllib.parse import urljoin @@ -95,7 +95,8 @@ def fold( return e return ESMProtein( - coordinates=maybe_tensor(data["coordinates"], convert_none_to_nan=True) + sequence=sequence, + coordinates=maybe_tensor(data["coordinates"], convert_none_to_nan=True), ) def inverse_fold( @@ -228,23 +229,18 @@ def batch_generate( """Forge supports auto-batching. So batch_generate() for the Forge client is as simple as running a collection of generate() in parallel using asyncio. 
""" - loop = asyncio.get_event_loop() - - async def _async_generate(): + with ThreadPoolExecutor() as executor: futures = [ - loop.run_in_executor(None, self.generate, protein, config) + executor.submit(self.generate, protein, config) for protein, config in zip(inputs, configs) ] - return await asyncio.gather(*futures, return_exceptions=True) - - results = loop.run_until_complete(_async_generate()) - - def _capture_exception(r): - if isinstance(r, BaseException) and not isinstance(r, ESMProteinError): - return ESMProteinError(500, str(r)) - return r - - return [_capture_exception(r) for r in results] + results = [] + for future in futures: + try: + results.append(future.result()) + except Exception as e: + results.append(ESMProteinError(500, str(e))) + return results def __generate_protein( self, input: ESMProtein, config: GenerationConfig @@ -530,6 +526,7 @@ def logits( "residue_annotations": config.residue_annotations, "return_embeddings": config.return_embeddings, "return_hidden_states": config.return_hidden_states, + "ith_hidden_layer": config.ith_hidden_layer, } request = {"model": self.model, "inputs": req, "logits_config": logits_config} diff --git a/esm/utils/constants/models.py b/esm/utils/constants/models.py index ea20342..456171d 100644 --- a/esm/utils/constants/models.py +++ b/esm/utils/constants/models.py @@ -10,6 +10,10 @@ ESMC_300M = "esmc_300m" +def forge_only_return_single_layer_hidden_states(model_name: str): + return model_name.startswith("esmc-6b") + + def model_is_locally_supported(x: str): return x in { ESM3_OPEN_SMALL, diff --git a/esm/utils/generation.py b/esm/utils/generation.py index 4222ea6..d4a6de3 100644 --- a/esm/utils/generation.py +++ b/esm/utils/generation.py @@ -604,7 +604,10 @@ def maybe_clone(x: torch.Tensor | None) -> torch.Tensor | None: tokens_dir["sasa"] = sasa_value probs = sasa_logits.softmax(dim=-1) - entropy = -(probs * sasa_logits.log_softmax(-1)).sum(-1) + # Note(tjia): sasa_logits can have -inf because of invalid ids, 
so + # probs * sasa_logits.log_softmax(-1) is nan. We need to set + # those positions to 0 to get the correct entropy value + entropy = -(torch.nan_to_num(probs * sasa_logits.log_softmax(-1))).sum(-1) track_sampling_metadata_dir["sasa"] = {"entropy": entropy} diff --git a/esm/utils/generation_test.py b/esm/utils/generation_test.py index 1041b6d..a8e8c21 100644 --- a/esm/utils/generation_test.py +++ b/esm/utils/generation_test.py @@ -15,7 +15,9 @@ @pytest.fixture() def esm3_remote_inference_client(): - model = _load_esm_model(ModelName.ESM3_TINY_DEV, distributed_model=False) + model = _load_esm_model( + ModelName.ESM3_TINY_DEV, distributed_model=False, load_function_decoder=False + ) client = ESM3RemoteModelInferenceClient( model, tokenizers=model.tokenizers, diff --git a/esm/utils/misc.py b/esm/utils/misc.py index 077bd24..1d2b08c 100644 --- a/esm/utils/misc.py +++ b/esm/utils/misc.py @@ -256,6 +256,14 @@ def merge_annotations( return merged +def replace_inf(data): + if data is None: + return None + array = np.array(data, dtype=np.float32, copy=False) + array = np.where(np.isinf(array), -1, array) + return array.tolist() + + def maybe_tensor(x, convert_none_to_nan: bool = False) -> torch.Tensor | None: if x is None: return None diff --git a/esm/utils/sampling.py b/esm/utils/sampling.py index 6611d71..68c5c86 100644 --- a/esm/utils/sampling.py +++ b/esm/utils/sampling.py @@ -181,7 +181,6 @@ def sample_logits( logits = top_p_logits(logits, top_p=top_p) temperature = _tensorize_like(temperature, logits) - batch_dims = logits.size()[:-1] logits = logits.reshape(-1, logits.shape[-1]) @@ -189,7 +188,7 @@ def sample_logits( # the /logits endpoint should receive unmodified logits if mask_logits_of_invalid_ids: mask = torch.ones_like(logits, dtype=torch.bool) - mask[:, valid_ids] = False + mask[..., valid_ids] = False logits[mask] = -torch.inf if torch.all(temperature == 0): @@ -279,7 +278,7 @@ def sample_sasa_logits( # the /logits endpoint should receive unmodified logits 
if mask_logits_of_invalid_ids: mask = torch.ones_like(logits, dtype=torch.bool) - mask[:, valid_ids] = False + mask[..., valid_ids] = False logits[mask] = -torch.inf sasa_probs = torch.nn.functional.softmax(logits, dim=-1) diff --git a/examples/forge_generate.py b/examples/forge_generate.py deleted file mode 100644 index ffd8d06..0000000 --- a/examples/forge_generate.py +++ /dev/null @@ -1,13 +0,0 @@ -import os -import sys - -from examples.local_generate import main -from esm.sdk import client - -if __name__ == "__main__": - if not os.environ.get("ESM_API_KEY", ""): - print("Please export your Forge API key as ESM_API_KEY environment variable.") - sys.exit(1) - - # Run Forge. - main(client()) diff --git a/pyproject.toml b/pyproject.toml index 0492b0e..6d9f75d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "esm" -version = "3.1.2" +version = "3.1.3" description = "EvolutionaryScale open model repository" readme = "README.md" requires-python = ">=3.10" diff --git a/tools/generate.ipynb b/tools/generate.ipynb index c4f2c4a..48eed79 100644 --- a/tools/generate.ipynb +++ b/tools/generate.ipynb @@ -2,111 +2,85 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ESM3 Generation Notebook\n", - "\n", - "This is the most flexible notebook for generating protein sequences using the ESM3 model.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "HC86rsLf-_Zt" + }, "source": [ - "### Setup\n", + "# Generation UI\n", "\n", - "Install dependencies and setup the colab environment for asyncio requests\n" + "This is the most flexible notebook for generating protein sequences using the ESM3 model." 
] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "ICGSD1Jo7zAb" + }, "outputs": [], "source": [ - "!pip install git+https://github.com/evolutionaryscale/esm\n", - "!pip install pydssp pygtrie dna-features-viewer nest_asyncio py3dmol" + "# @title Input API keys, then hit `Runtime` -> `Run all`\n", + "# @markdown Our hosted service that provides access to the full suite of ESM3 models.\n", + "# @markdown To utilize the Forge API, users must first agree to the [Terms of Service](https://forge.evolutionaryscale.ai/termsofservice) and generate an access token via the [Forge console](https://forge.evolutionaryscale.ai/console).\n", + "# @markdown The console also provides a comprehensive list of models available to each user.\n", + "\n", + "import os\n", + "\n", + "# @markdown ### Authentication\n", + "# @markdown Paste your token from the [Forge console](https://forge.evolutionaryscale.ai/console)\n", + "forge_token = \"\" # @param {type:\"string\"}\n", + "os.environ[\"FORGE_TOKEN\"] = forge_token\n", + "\n", + "# @markdown ### Model Selection\n", + "# @markdown Enter the model name from the [Forge console page](https://forge.evolutionaryscale.ai/console) that you would like to use:\n", + "model_name = \"esm3-medium-2024-03\" # @param {type:\"string\"}" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "03ARpZRE_N39" + }, "outputs": [], "source": [ - "import nest_asyncio\n", + "# @title Install dependencies\n", + "import os\n", "\n", - "nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference Settings\n" + "os.system(\"pip install git+https://github.com/evolutionaryscale/esm\")\n", + "os.system(\"pip install pydssp pygtrie dna-features-viewer py3dmol\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "x1MUAuDWBAel" 
+ }, "outputs": [], "source": [ + "# @title Create Generation UI\n", + "# @markdown If running on Google colab, it is recommended to use the light theme and select the \"View output fullscreen\" option in the cell toolbar for the best experience\n", + "\n", + "from functools import partial\n", + "\n", + "from esm.widgets.utils.clients import get_forge_client\n", "from esm.widgets.utils.types import ClientInitContainer\n", "from esm.widgets.views.generation import create_generation_ui\n", - "from esm.widgets.views.login import create_login_ui" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client_init = ClientInitContainer()\n", - "create_login_ui(client_init)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to log into huggingface if using the model locally\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from esm.utils.misc import huggingfacehub_login\n", - "\n", - "if client_init.metadata[\"inference_option\"] == \"Local\":\n", - " huggingfacehub_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Generation UI\n", "\n", - "If running on Google colab, it is recommended to use the light theme and select the \"View output fullscreen\" option in the cell toolbar for the best experience\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = client_init()\n", - "create_generation_ui(client)" + "client_container = ClientInitContainer()\n", + "client_container.client_init_callback = partial(get_forge_client, model_name)\n", + "create_generation_ui(client_container)" ] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -126,5 +100,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 0 } diff --git 
a/tools/invfold.ipynb b/tools/invfold.ipynb index 07ab941..d825c30 100644 --- a/tools/invfold.ipynb +++ b/tools/invfold.ipynb @@ -2,111 +2,176 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "4l-TA3Od1JFs" + }, "source": [ "# ESM3 Inverse Folding Notebook\n", "\n", "This notebook is intended to be used as a tool for inverse folding using the ESM3 model.\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup\n", - "\n", - "Install dependencies and setup the colab environment for asyncio requests\n" - ] - }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "1TwEAW_LSNZZ" + }, "outputs": [], "source": [ - "!pip install git+https://github.com/evolutionaryscale/esm\n", - "!pip install pydssp pygtrie dna-features-viewer nest_asyncio py3dmol" + "# @title Input API keys, then hit `Runtime` -> `Run all`\n", + "# @markdown Our hosted service that provides access to the full suite of ESM3 models.\n", + "# @markdown To utilize the Forge API, users must first agree to the [Terms of Service](https://forge.evolutionaryscale.ai/termsofservice) and generate an access token via the [Forge console](https://forge.evolutionaryscale.ai/console).\n", + "# @markdown The console also provides a comprehensive list of models available to each user.\n", + "\n", + "import os\n", + "\n", + "# @markdown ### Authentication\n", + "# @markdown Paste your token from the [Forge console](https://forge.evolutionaryscale.ai/console)\n", + "forge_token = \"\" # @param {type:\"string\"}\n", + "os.environ[\"ESM_API_KEY\"] = forge_token\n", + "\n", + "# @markdown ### Model Selection\n", + "# @markdown Enter the model name from the [Forge console page](https://forge.evolutionaryscale.ai/console) that you would like to use:\n", + "model_name = \"esm3-medium-2024-08\" # @param {type:\"string\"}\n", + "\n", + "# @markdown ### Input Structure\n", + "pdb_code = \"\" # @param 
{type:\"string\"}\n", + "chain = \"detect\" # @param {type:\"string\"}\n", + "# @markdown Enter PDB code or leave blank to upload file\n", + "# @markdown Specify a chain if uploading a complex\n", + "\n", + "# @markdown ### Design Parameters\n", + "temperature = 0.1 # @param {type:\"slider\", min:0.0, max:1.0, step:0.01}\n", + "num_sequences = 8 # @param {type:\"integer\"}" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "_942E63WS8-U" + }, "outputs": [], "source": [ - "import nest_asyncio\n", + "# @title Install dependencies\n", + "import os\n", + "\n", + "os.system(\"pip install git+https://github.com/evolutionaryscale/esm\")\n", + "os.system(\n", + " \"pip install pydssp pygtrie dna-features-viewer py3dmol nest-asyncio ipywidgets\"\n", + ")\n", + "\n", + "import nest_asyncio # noqa: E402\n", "\n", "nest_asyncio.apply()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference Settings\n" - ] - }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "jXl61b-zTIsp" + }, "outputs": [], "source": [ - "from esm.widgets.utils.types import ClientInitContainer\n", - "from esm.widgets.views.inverse_folding import create_inverse_folding_ui\n", - "from esm.widgets.views.login import create_login_ui" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client_init = ClientInitContainer()\n", - "create_login_ui(client_init)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to log into huggingface if using the model locally\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from esm.utils.misc import huggingfacehub_login\n", + "# @title Run Inverse Folding\n", + "import numpy as np\n", + "from esm.sdk.api import ESMProtein, ESMProteinError, GenerationConfig\n", + 
"from esm.widgets.utils.clients import get_forge_client\n", + "from google.colab import files\n", + "from IPython.display import HTML\n", "\n", - "if client_init.metadata[\"inference_option\"] == \"Local\":\n", - " huggingfacehub_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inverse Folding UI\n", "\n", - "If running on Google colab, it is recommended to use the light theme and select the \"View output fullscreen\" option in the cell toolbar for the best experience.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = client_init()\n", - "create_inverse_folding_ui(client)" + "def get_pdb(pdb_code=\"\"):\n", + " if pdb_code is None or pdb_code == \"\":\n", + " upload_dict = files.upload()\n", + " pdb_string = upload_dict[list(upload_dict.keys())[0]]\n", + " with open(\"tmp.pdb\", \"wb\") as out:\n", + " out.write(pdb_string)\n", + " return \"tmp.pdb\"\n", + " else:\n", + " os.system(f\"wget -qnc https://files.rcsb.org/view/{pdb_code}.pdb\")\n", + " return f\"{pdb_code}.pdb\"\n", + "\n", + "\n", + "print(\"Loading structure...\")\n", + "pdb_path = get_pdb(pdb_code)\n", + "\n", + "# Create protein object\n", + "protein = ESMProtein.from_pdb(pdb_path, chain_id=chain)\n", + "protein.sequence = None\n", + "\n", + "print(\"Running inverse folding...\")\n", + "client = get_forge_client(model_name)\n", + "generations = client.batch_generate(\n", + " inputs=[protein] * num_sequences,\n", + " configs=[GenerationConfig(track=\"sequence\", temperature=temperature)]\n", + " * num_sequences,\n", + ")\n", + "\n", + "if isinstance(protein, ESMProteinError):\n", + " raise RuntimeError(f\"Error: {str(protein)}\")\n", + "\n", + "errors: list[ESMProteinError] = []\n", + "sequences: list[str] = []\n", + "for i, protein in enumerate(generations):\n", + " if isinstance(protein, ESMProteinError):\n", + " errors.append((i, protein))\n", + " else:\n", + " 
sequences.append(protein.sequence)\n", + "\n", + "\n", + "def calculate_conservation_scores(sequences: list[str]) -> np.ndarray:\n", + " array = np.array([list(seq) for seq in sequences], dtype=\"S1\")\n", + " array = array.view(np.uint8) - ord(\"A\")\n", + "\n", + " # Create a 2D array of counts\n", + " max_range = 26\n", + " counts = np.zeros((max_range + 1, array.shape[1]), dtype=int)\n", + " for col in range(array.shape[1]):\n", + " count = np.bincount(array[:, col], minlength=max_range + 1)\n", + " counts[:, col] = count\n", + " counts = counts.T\n", + "\n", + " # Calculate entropy (-sum(p log p))\n", + " probabilities = counts / counts.sum(axis=1, keepdims=True)\n", + " entropy = -np.sum(probabilities * np.log(probabilities + 1e-9), axis=1)\n", + "\n", + " # Convert to conservation score (1 - normalized entropy)\n", + " max_entropy = np.log(256)\n", + " # Magic constant to make displaying non-conserved residues more apparent\n", + " conservation_scores = np.maximum(0, 0.5 - (entropy / max_entropy)) / 0.5\n", + "\n", + " return conservation_scores\n", + "\n", + "\n", + "def display_sequences(sequences: list[str]):\n", + " conservation_scores = calculate_conservation_scores(sequences)\n", + " html_output = '
'\n",
+    "    for sequence in sequences:\n",
+    "        for j, residue in enumerate(sequence):\n",
+    "            # Add padding for alignment and color the background\n",
+    "            html_output += f'{residue}'\n",
+    "        html_output += \"
\"\n", + " html_output += \"
\"\n", + " display(HTML(html_output))\n", + "\n", + "\n", + "display_sequences(sequences)\n", + "\n", + "for i, error in errors:\n", + " print(f\"Error code {error.error_code} at index {i}: {error.error_msg}\")" ] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -114,5 +179,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 0 } diff --git a/tools/predict.ipynb b/tools/predict.ipynb index c098d55..214529a 100644 --- a/tools/predict.ipynb +++ b/tools/predict.ipynb @@ -2,111 +2,141 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "wO0XaARp1Ghc" + }, "source": [ "# ESM3 Prediction Notebook\n", "\n", "This notebook is intended to be used as a tool for quick and easy protein property prediction using the ESM3 model.\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup\n", - "\n", - "Install dependencies and setup the colab environment for asyncio requests\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install git+https://github.com/evolutionaryscale/esm\n", - "!pip install pydssp pygtrie dna-features-viewer nest_asyncio py3dmol" - ] - }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "0zITyTcwKK2o" + }, "outputs": [], "source": [ - "import nest_asyncio\n", + "# @title Input API keys, then hit `Runtime` -> `Run all`\n", + "# @markdown Our hosted service that provides access to the full suite of ESM3 models.\n", + "# @markdown To utilize the Forge API, users must first agree to the [Terms of Service](https://forge.evolutionaryscale.ai/termsofservice) and generate an access token via the [Forge console](https://forge.evolutionaryscale.ai/console).\n", + "# @markdown The console also provides a comprehensive list of models available to each user.\n", "\n", - 
"nest_asyncio.apply()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Inference Settings\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from esm.widgets.utils.types import ClientInitContainer\n", - "from esm.widgets.views.login import create_login_ui\n", - "from esm.widgets.views.prediction import create_prediction_ui" + "import os\n", + "\n", + "# @markdown ### Authentication\n", + "# @markdown Paste your token from the [Forge console](https://forge.evolutionaryscale.ai/console)\n", + "forge_token = \"\" # @param {type:\"string\"}\n", + "os.environ[\"ESM_API_KEY\"] = forge_token\n", + "\n", + "# @markdown ### Model Selection\n", + "# @markdown Enter the model name from the [Forge console page](https://forge.evolutionaryscale.ai/console) that you would like to use:\n", + "model_name = \"esm3-medium-2024-08\" # @param {type:\"string\"}\n", + "\n", + "# markdown ### Sequence\n", + "# @markdown Please use '|' to delimit a multimer sequence.\n", + "sequence = \"MSHHWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSVSYDQATSLRILNNGHAFNVEFDDSQDKAVLKGGPLDGTYRLIQFHFHWGSLDGQGSEHTVDKKKYAAELHLVHWNTKYGDFGKAVQQPDGLAVLGIFLKVGSAKPGLQKVVDVLDSIKTKGKSADFTNFDPRGLLPESLDYWTYPGSLTTPPLLECVTWIVLKEPISVSSEQVLKFRKLNFNGEGEPEELMVDNWRPAQPLKNRQIKASFK\" # @param {type:\"string\"}" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "CryS18DaKgjP" + }, "outputs": [], "source": [ - "client_init = ClientInitContainer()\n", - "create_login_ui(client_init)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We need to log into huggingface if using the model locally\n" + "# @title Install dependencies\n", + "import os\n", + "\n", + "os.system(\"pip install git+https://github.com/evolutionaryscale/esm\")\n", + "os.system(\"pip install pydssp pygtrie dna-features-viewer py3dmol ipywidgets\")" ] }, { "cell_type": "code", 
"execution_count": null, - "metadata": {}, + "metadata": { + "cellView": "form", + "id": "ej6cllESKj5S" + }, "outputs": [], "source": [ - "from esm.utils.misc import huggingfacehub_login\n", + "# @title Run Prediction and Display Results\n", + "from esm.sdk.api import ESMProtein, ESMProteinError, GenerationConfig\n", + "from esm.widgets.components.results_visualizer import create_results_visualizer\n", + "from esm.widgets.utils.clients import get_forge_client\n", + "from ipywidgets import widgets\n", "\n", - "if client_init.metadata[\"inference_option\"] == \"Local\":\n", - " huggingfacehub_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prediction UI\n", + "# Initialize client\n", + "client = get_forge_client(model_name)\n", "\n", - "If running on Google colab, it is recommended to use the light theme and select the \"View output fullscreen\" option in the cell toolbar for the best experience.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client = client_init()\n", - "create_prediction_ui(client)" + "# Create protein object\n", + "protein = ESMProtein(sequence=sequence)\n", + "\n", + "# Predict all tracks\n", + "tracks = [\"structure\", \"secondary_structure\", \"sasa\", \"function\"]\n", + "\n", + "output = widgets.Output()\n", + "display(output)\n", + "with output:\n", + " print(\"Starting predictions...\")\n", + "\n", + " for track in tracks:\n", + " print(f\"Predicting {track}...\")\n", + " protein = client.generate(\n", + " protein, config=GenerationConfig(track=track, temperature=0.01)\n", + " )\n", + " if isinstance(protein, ESMProteinError):\n", + " raise RuntimeError(f\"Error: {str(protein)}\")\n", + "\n", + " # Create result visualizers\n", + " structure_results = create_results_visualizer(\n", + " modality=\"structure\", samples=[protein], items_per_page=1, include_title=False\n", + " )\n", + "\n", + " secondary_structure_results = 
create_results_visualizer(\n", + " modality=\"secondary_structure\",\n", + " samples=[protein],\n", + " items_per_page=1,\n", + " include_title=False,\n", + " )\n", + "\n", + " sasa_results = create_results_visualizer(\n", + " modality=\"sasa\", samples=[protein], items_per_page=1, include_title=False\n", + " )\n", + "\n", + " function_results = create_results_visualizer(\n", + " modality=\"function\", samples=[protein], items_per_page=1, include_title=False\n", + " )\n", + "\n", + " output.clear_output(wait=True)\n", + "\n", + " # Create tabbed interface\n", + " results_ui = widgets.Tab(\n", + " children=[\n", + " structure_results,\n", + " secondary_structure_results,\n", + " sasa_results,\n", + " function_results,\n", + " ]\n", + " )\n", + " results_ui.set_title(0, \"Structure\")\n", + " results_ui.set_title(1, \"Secondary Structure\")\n", + " results_ui.set_title(2, \"SASA\")\n", + " results_ui.set_title(3, \"Function\")\n", + " display(results_ui)" ] } ], "metadata": { + "colab": { + "provenance": [] + }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", @@ -114,5 +144,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 0 }