Commit 897ec5a
Merge pull request #43 from monarch-initiative/uniprot-loader
uniprot loader
cmungall authored Aug 2, 2024
2 parents 244e39d + 27ec399 commit 897ec5a
Showing 8 changed files with 242 additions and 14 deletions.
3 changes: 3 additions & 0 deletions Makefile
@@ -78,6 +78,9 @@ load-bacdive:
load-cdr:
$(CURATE) -v index -p $(DB_PATH) -V bioc -c cdr_test -m openai: data/CDR_TestSet.BioC.xml.gz

+load-uniprot-%:
+	$(CURATE) -v index -p $(DB_PATH) -V uniprot -c uniprot_$* -m openai: --view-settings "taxon_id: $*"

## -- GitHub issues --

# TODO: patternize
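The % pattern plus the automatic variable $* make this a parameterized loader. As a usage sketch (with $(CURATE) and $(DB_PATH) left symbolic, as in the Makefile):

    make load-uniprot-9606
    # $* binds to "9606" (human), so the recipe expands to:
    # $(CURATE) -v index -p $(DB_PATH) -V uniprot -c uniprot_9606 -m openai: --view-settings "taxon_id: 9606"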
2 changes: 1 addition & 1 deletion src/curate_gpt/agents/chat_agent.py
@@ -89,7 +89,7 @@ def chat(
kwargs["collection"] = collection
kb_results = list(
self.knowledge_source.search(
-                query, relevance_factor=self.relevance_factor, limit=limit, **kwargs
+                query, relevance_factor=self.relevance_factor, limit=limit, expand=expand, **kwargs
)
)
while True:
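Together with the CLI change below, the expand flag now travels from ChatAgent.chat into the knowledge source's search call. A minimal sketch of the flow (wrapper choice and extractor setup are illustrative, mirroring view_ask below; import paths assumed):

    from curate_gpt.agents.chat_agent import ChatAgent
    from curate_gpt.extract import BasicExtractor
    from curate_gpt.wrappers import get_wrapper

    source = get_wrapper("omicsdi")
    source.extractor = BasicExtractor(model_name="openai:")
    agent = ChatAgent(knowledge_source=source)
    # expand=False is forwarded verbatim to source.search(..., expand=False)
    response = agent.chat("proteomics datasets relevant to glioblastoma", limit=10, expand=False)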
2 changes: 1 addition & 1 deletion src/curate_gpt/agents/evidence_agent.py
@@ -199,6 +199,6 @@ def _add_evidence(input_obj: dict) -> dict:
if isinstance(sub_obj, dict):
_add_evidence(sub_obj)
elif isinstance(v, dict):
-                _add_evidence(sub_obj)
+                _add_evidence(v)
logger.info(f"Found {len(new_evidences)} evidence objects.")
return obj
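(The fix above addresses a recursion bug: in the elif branch, the old code recursed on the stale loop variable sub_obj from the list-handling branch rather than on the dict value v, so the nested dict was never actually visited.)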
32 changes: 20 additions & 12 deletions src/curate_gpt/cli.py
@@ -233,6 +233,7 @@ def main(verbose: int, quiet: bool):
"-V",
help="View/Proxy to use for the database, e.g. bioc.",
)
@click.option("--view-settings", help="YAML settings for the view wrapper.")
@click.option(
"--glob/--no-glob", default=False, show_default=True, help="Whether to glob the files."
)
@@ -263,6 +264,7 @@ def index(
collect,
encoding,
remove_field,
view_settings,
**kwargs,
):
"""
@@ -296,7 +298,11 @@
if glob:
files = [str(gf.absolute()) for f in files for gf in Path().glob(f) if gf.is_file()]
if view:
-        wrapper = get_wrapper(view)
+        view_args = {}
+        if view_settings:
+            view_args = yaml.safe_load(view_settings)
+            logging.info(f"View settings: {view_args}")
+        wrapper = get_wrapper(view, **view_args)
if not object_type:
object_type = wrapper.default_object_type
if not description:
@@ -310,6 +316,8 @@
db.remove_collection(collection)
if model is None:
model = "openai:"
+    if not files and wrapper:
+        files = ["API"]
for file in files:
if encoding == "detect":
import chardet
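In effect, --view-settings lets a YAML string supply constructor arguments to the view wrapper. A minimal sketch of the equivalent Python (wrapper name taken from the Makefile target above; get_wrapper is exported from curate_gpt.wrappers as shown later in this diff):

    import yaml
    from curate_gpt.wrappers import get_wrapper

    view_args = yaml.safe_load("taxon_id: 9606")   # -> {"taxon_id": 9606}
    wrapper = get_wrapper("uniprot", **view_args)  # i.e. UniprotWrapper(taxon_id=9606)

The files = ["API"] fallback keeps the indexing loop running once when a wrapper is given without input files, presumably dispatching to the wrapper's own objects() stream.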
@@ -1078,16 +1086,10 @@ def review(
Example:
-------
-        curategpt complete -c obo_go "umbelliferose biosynthetic process"
-    If the string looks like yaml (if it has a ':') then it will be parsed as yaml.
-    E.g
-        curategpt complete -c obo_go "label: umbelliferose biosynthetic process"
curategpt review -c obo_obi "{}" -Z definition -t patch \
--primary-key original_id --rule "make definitions simple and easy to read by domain scientists. \
At the same time, conform to genus-differentia style and OBO best practice."
Pass ``--extract-format`` to make the extractor use a different internal representation
when communicating to the LLM
"""
where_str = " ".join(where)
where_q = yaml.safe_load(where_str)
@@ -2303,13 +2305,19 @@ def view_index(
@click.option("--source-locator")
@limit_option
@model_option
+@click.option(
+    "--expand/--no-expand",
+    default=True,
+    show_default=True,
+    help="Whether to expand the search term using an LLM.",
+)
@click.argument("query")
-def view_ask(query, view, model, limit, **kwargs):
+def view_ask(query, view, model, limit, expand, **kwargs):
"""Ask a knowledge source wrapper."""
vstore: BaseWrapper = get_wrapper(view)
vstore.extractor = BasicExtractor(model_name=model)
chatbot = ChatAgent(knowledge_source=vstore)
-    response = chatbot.chat(query, limit=limit)
+    response = chatbot.chat(query, limit=limit, expand=expand)
show_chat_response(response, True)
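Assuming the subcommands are registered under a view group with a -V/--view option (names inferred from view_index and view_ask above, not shown in this hunk), the new flag can be exercised from the shell:

    curategpt view ask -V omicsdi -m openai: --no-expand "proteomics datasets relevant to glioblastoma"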


4 changes: 4 additions & 0 deletions src/curate_gpt/wrappers/__init__.py
@@ -20,6 +20,8 @@
"AllianceGeneWrapper",
"NCBIBiosampleWrapper",
"NCBIBioprojectWrapper",
"OmicsDIWrapper",
"UniprotWrapper",
"OBOFormatWrapper",
"ClinVarWrapper",
"PMCWrapper",
@@ -47,7 +49,9 @@ def get_wrapper(name: str, **kwargs) -> BaseWrapper:
from curate_gpt.wrappers.bio.bacdive_wrapper import BacDiveWrapper # noqa
from curate_gpt.wrappers.bio.gocam_wrapper import GOCAMWrapper # noqa
from curate_gpt.wrappers.bio.mediadive_wrapper import MediaDiveWrapper # noqa
+    from curate_gpt.wrappers.bio.omicsdi_wrapper import OmicsDIWrapper  # noqa
from curate_gpt.wrappers.bio.reactome_wrapper import ReactomeWrapper # noqa
+    from curate_gpt.wrappers.bio.uniprot_wrapper import UniprotWrapper  # noqa
from curate_gpt.wrappers.clinical.ctgov_wrapper import ClinicalTrialsWrapper # noqa
from curate_gpt.wrappers.clinical.clinvar_wrapper import ClinVarWrapper # noqa
from curate_gpt.wrappers.clinical.hpoa_by_pub_wrapper import HPOAByPubWrapper # noqa
115 changes: 115 additions & 0 deletions src/curate_gpt/wrappers/bio/omicsdi_wrapper.py
@@ -0,0 +1,115 @@
"""Chat with a KB."""
import logging
import time
from dataclasses import dataclass
from typing import ClassVar, Dict, List, Optional

import requests

from curate_gpt.wrappers.base_wrapper import BaseWrapper

logger = logging.getLogger(__name__)

BASE_URL = "http://wwwdev.ebi.ac.uk/Tools/omicsdi/ws"


@dataclass
class OmicsDIWrapper(BaseWrapper):
    """
    A wrapper providing a search facade over OmicsDI Search.

    This is a dynamic wrapper: it can be used as a search facade,
    but cannot be ingested in whole.
    """

name: ClassVar[str] = "omicsdi"

default_object_type = "Dataset"

    source: str = None  # default source database, e.g. "pride"

def external_search(self, text: str, expand: bool = False, where: Optional[Dict] = None, **kwargs) -> List[Dict]:
"""
Search the OmicsDI database for the given text.
TODO: full text retrieval doesn't seem to work well
"""
params = {
"size": 20,
"faceCount": 30,
}
if expand:
def qt(t: str):
t = t.strip()
if " " in t:
return f'"{t}"'
return t
logger.info(f"Expanding search term: {text} to create OmicsDI query")
model = self.extractor.model
response = model.prompt(
text, system="""
Take the specified search text, and expand it to a list
of key terms used to construct a query. You will return results as
semi-colon separated list of the most relevant terms. Make sure to
include all relevant concepts in the returned terms."""
)
terms = response.text().split(";")
logger.info(f"Expanded terms: {terms}")
terms = [qt(t) for t in terms]
# terms = terms[0:2]
search_term = " ".join(terms)
else:
search_term = text
params["query"] = search_term
if where:
params.update(where)
time.sleep(0.25)
logger.info(f"Constructed query: {params}")
url = f"{BASE_URL}/dataset/search"
logger.info(f"Searching OmicsDI {url} with query: {params}")
response = requests.get(url, params=params)
data = response.json()

datasets = data["datasets"]
logger.info(f"Found {len(datasets)} datasets")
for dataset in datasets:
dataset["additional"] = self.additional_info(dataset["id"], dataset["source"])
return datasets

def additional_info(self, local_id: str, source: str):
"""
Augment the local ID with information from the database.
"""
url = f"{BASE_URL}/dataset/{source}/{local_id}"
logger.info(f"Getting additional info from {url}")
response = requests.get(url)
response.raise_for_status()
data = response.json()
return data["additional"]

    def objects_by_ids(self, ids: List[str], source: str = None):
        """
        Fetch the full dataset objects for the given IDs.
        """
        source = source or self.source
        datas = []
        for object_id in ids:
            if ":" in object_id:
                source, local_id = object_id.split(":", 1)
            else:
                local_id = object_id
            if not source:
                raise ValueError(f"Need a source for ID {object_id}")
            source = source.lower()
url = f"{BASE_URL}/dataset/{source}/{local_id}"
logger.info(f"Getting additional info from {url}")
response = requests.get(url)
response.raise_for_status()
data = response.json()
datas.append(data)

return datas
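A usage sketch for the new wrapper (attribute and key names as in the code above; the extractor is only needed when expand=True):

    from curate_gpt.wrappers.bio.omicsdi_wrapper import OmicsDIWrapper

    w = OmicsDIWrapper(source="pride")
    hits = w.external_search("glioblastoma proteomics", expand=False)
    for ds in hits[:3]:
        # each hit is augmented with an "additional" block fetched per dataset
        print(ds["id"], ds["source"], ds["additional"])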

98 changes: 98 additions & 0 deletions src/curate_gpt/wrappers/bio/uniprot_wrapper.py
@@ -0,0 +1,98 @@
"""Chat with a KB."""
import logging
from dataclasses import dataclass, field
from typing import ClassVar, Dict, Iterable, Iterator, List, Optional

import requests
import requests_cache
from oaklib import BasicOntologyInterface

from curate_gpt.wrappers import BaseWrapper

logger = logging.getLogger(__name__)

BASE_URL = "https://rest.uniprot.org/uniprotkb"


@dataclass
class UniprotWrapper(BaseWrapper):
    """
    A wrapper over the UniProt REST API.
    """

name: ClassVar[str] = "uniprot"

_label_adapter: BasicOntologyInterface = None

default_object_type = "Protein"

    # e.g. "NCBITaxon:9606"; the prefix is stripped before querying
    taxon_id: Optional[str] = None

session: requests_cache.CachedSession = field(
default_factory=lambda: requests_cache.CachedSession("uniprot")
)

def objects(
self, collection: str = None, object_ids: Iterable[str] = None, **kwargs
) -> Iterator[Dict]:
"""
        Iterate over all proteins for the configured taxon.
"""
object_ids = object_ids or self.object_ids()
return self.objects_by_ids(object_ids)

    def object_ids(self, taxon_id: str = None, **kwargs) -> Iterator[str]:
        """
        Get all reviewed (Swiss-Prot) protein accessions for a given taxon ID.

        :param taxon_id: NCBI taxon ID, with or without the "NCBITaxon:" prefix
        :param kwargs:
        :return: iterator over primary accessions
        """
url = f"{BASE_URL}/search"
session = self.session
taxon_id = taxon_id or self.taxon_id
if not taxon_id:
raise ValueError("Taxon ID is required")
taxon_id = str(taxon_id).replace("NCBITaxon:", "")
        params = {
            "query": f"organism_id:{taxon_id} AND reviewed:true",
            # restrict to the given taxon and to reviewed (Swiss-Prot) entries
            "format": "json",  # response format
            "size": 500,  # number of results per page (max 500)
            "fields": "accession,id",  # fields to retrieve
        }

# Send the request
logger.info(f"Getting proteins for taxon {taxon_id}")
response = session.get(url, params=params)
response.raise_for_status()
data = response.json()
entries = data.get("results", [])
for entry in entries:
logger.debug(f"Got entry: {entry}")
yield entry["primaryAccession"]

def objects_by_ids(self, object_ids: List[str]) -> List[Dict]:
session = self.session
objs = []
        for object_id in object_ids:
            if ":" in object_id:
                pfx, object_id = object_id.split(":", 1)
                if pfx.lower() not in ["uniprot", "uniprotkb"]:
                    raise ValueError(f"Invalid object id prefix: {pfx}")
url = f"{BASE_URL}/{object_id}.json"
logger.info(f"Getting protein data for {object_id} from {url}")
response = session.get(url)
response.raise_for_status()
data = response.json()
objs.append(data)
return objs
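
And a corresponding sketch for the UniProt wrapper (field names as defined above; requests-cache persists responses in a local "uniprot" cache):

    from curate_gpt.wrappers.bio.uniprot_wrapper import UniprotWrapper

    w = UniprotWrapper(taxon_id="NCBITaxon:9606")  # prefix is stripped before querying
    accessions = list(w.object_ids())              # reviewed accessions, one page of up to 500
    proteins = w.objects_by_ids(accessions[:3])    # full JSON records from the UniProtKB REST API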
Empty file.
