-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from monarch-initiative/uniprot-loader
uniprot loader
- Loading branch information
Showing
8 changed files
with
242 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
"""Chat with a KB.""" | ||
import logging | ||
import time | ||
from dataclasses import dataclass | ||
from typing import ClassVar, List, Optional, Dict | ||
|
||
import inflection | ||
import requests | ||
import yaml | ||
|
||
from curate_gpt.wrappers.base_wrapper import BaseWrapper | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
BASE_URL = "http://wwwdev.ebi.ac.uk/Tools/omicsdi/ws" | ||
|
||
|
||
@dataclass
class OmicsDIWrapper(BaseWrapper):
    """
    A wrapper to provide a search facade over OMICS DI Search.

    This is a dynamic wrapper: it can be used as a search facade,
    but cannot be ingested in whole.
    """

    name: ClassVar[str] = "omicsdi"

    default_object_type = "Dataset"

    # Default OmicsDI data source (e.g. "pride"); may also be encoded as a
    # prefix in individual IDs ("PRIDE:PXD000001"). Was annotated `str` but
    # defaults to None, so Optional is the correct type.
    source: Optional[str] = None

    def external_search(
        self, text: str, expand: bool = False, where: Optional[Dict] = None, **kwargs
    ) -> List[Dict]:
        """
        Search the OmicsDI database for the given text.

        :param text: free-text search query
        :param expand: if True, use the extractor's LLM to expand the text
            into a semicolon-separated list of key terms before querying
        :param where: extra query parameters merged into the request
        :return: list of dataset dicts, each augmented with an ``additional``
            key fetched from the per-dataset detail endpoint

        TODO: full text retrieval doesn't seem to work well
        """
        params = {
            "size": 20,
            "faceCount": 30,
        }
        if expand:
            def quote_term(term: str) -> str:
                # Phrase-quote multi-word terms so they match as a unit.
                term = term.strip()
                if " " in term:
                    return f'"{term}"'
                return term

            logger.info(f"Expanding search term: {text} to create OmicsDI query")
            model = self.extractor.model
            response = model.prompt(
                text, system="""
                Take the specified search text, and expand it to a list
                of key terms used to construct a query. You will return results as
                semi-colon separated list of the most relevant terms. Make sure to
                include all relevant concepts in the returned terms."""
            )
            terms = response.text().split(";")
            logger.info(f"Expanded terms: {terms}")
            search_term = " ".join(quote_term(t) for t in terms)
        else:
            search_term = text
        params["query"] = search_term
        if where:
            params.update(where)
        # Be polite to the EBI service between successive calls.
        time.sleep(0.25)
        logger.info(f"Constructed query: {params}")
        url = f"{BASE_URL}/dataset/search"
        logger.info(f"Searching OmicsDI {url} with query: {params}")
        response = requests.get(url, params=params)
        # Fail loudly on HTTP errors, consistent with additional_info();
        # previously an error page would surface as a confusing KeyError.
        response.raise_for_status()
        data = response.json()

        datasets = data["datasets"]
        logger.info(f"Found {len(datasets)} datasets")
        for dataset in datasets:
            dataset["additional"] = self.additional_info(dataset["id"], dataset["source"])
        return datasets

    def additional_info(self, local_id: str, source: str) -> Dict:
        """
        Fetch the "additional" metadata block for one dataset.

        :param local_id: dataset ID local to the source (e.g. "PXD000001")
        :param source: OmicsDI source database name (e.g. "pride")
        :raises requests.HTTPError: if the detail endpoint returns an error
        """
        url = f"{BASE_URL}/dataset/{source}/{local_id}"
        logger.info(f"Getting additional info from {url}")
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        return data["additional"]

    def objects_by_ids(self, ids: List[str], source: str = None) -> List[Dict]:
        """
        Fetch full dataset objects for the given IDs.

        :param ids: IDs, either prefixed ("PRIDE:PXD000001") or bare local
            IDs (requires ``source`` or ``self.source``)
        :param source: default source for un-prefixed IDs
        :raises ValueError: if an un-prefixed ID has no source to resolve it
        """
        source = source or self.source
        datas = []
        # NOTE(review): a prefixed ID updates `source` for the REMAINDER of
        # the loop, so later bare IDs inherit the last-seen prefix. Preserved
        # as-is — confirm whether this carry-over is intentional.
        for object_id in ids:  # renamed from `id` to avoid shadowing the builtin
            if ":" in object_id:
                source, local_id = object_id.split(":")
            else:
                local_id = object_id
            if not source:
                raise ValueError(f"Need a source for ID {object_id}")
            source = source.lower()
            url = f"{BASE_URL}/dataset/{source}/{local_id}"
            logger.info(f"Getting additional info from {url}")
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            datas.append(data)

        return datas
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
"""Chat with a KB.""" | ||
import logging | ||
from dataclasses import dataclass, field | ||
from typing import ClassVar, Dict, Iterable, Iterator, Optional, List | ||
|
||
import requests | ||
import requests_cache | ||
from oaklib import BasicOntologyInterface | ||
|
||
from curate_gpt.wrappers import BaseWrapper | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
BASE_URL = "https://rest.uniprot.org/uniprotkb" | ||
|
||
|
||
@dataclass
class UniprotWrapper(BaseWrapper):
    """
    A wrapper over the UniProt REST API.
    """

    name: ClassVar[str] = "uniprot"

    # Lazily-populated ontology adapter for label lookups (see BaseWrapper).
    _label_adapter: BasicOntologyInterface = None

    default_object_type = "Protein"

    # NCBI taxon CURIE or bare numeric ID (e.g. "NCBITaxon:9606");
    # required by object_ids() unless passed explicitly per call.
    taxon_id: Optional[str] = None

    # Cached HTTP session so repeated identical UniProt requests hit disk,
    # not the network.
    session: requests_cache.CachedSession = field(
        default_factory=lambda: requests_cache.CachedSession("uniprot")
    )

    def objects(
        self, collection: str = None, object_ids: Iterable[str] = None, **kwargs
    ) -> Iterator[Dict]:
        """
        Fetch protein entries, either for the given IDs or for all
        reviewed proteins of the configured taxon.
        """
        object_ids = object_ids or self.object_ids()
        return self.objects_by_ids(object_ids)

    def object_ids(self, taxon_id: str = None, **kwargs) -> Iterator[str]:
        """
        Get all protein accessions for a given taxon id.

        :param taxon_id: NCBI taxon CURIE or bare numeric ID; falls back to
            ``self.taxon_id``
        :return: iterator of primary accessions (reviewed entries only)
        :raises ValueError: if no taxon id is available
        """
        url = f"{BASE_URL}/search"
        session = self.session
        taxon_id = taxon_id or self.taxon_id
        if not taxon_id:
            raise ValueError("Taxon ID is required")
        taxon_id = str(taxon_id).replace("NCBITaxon:", "")
        params = {
            # Reviewed (Swiss-Prot) proteins for the taxon only.
            # (The stray f-string on the "query" key has been removed.)
            "query": f"organism_id:{taxon_id} AND reviewed:true",
            "format": "json",  # Response format
            "size": 500,  # Number of results per page (max 500)
            "fields": "accession,id",  # Fields to retrieve
        }

        # Send the request
        logger.info(f"Getting proteins for taxon {taxon_id}")
        response = session.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        entries = data.get("results", [])
        # NOTE(review): only the first page (<=500 results) is consumed;
        # UniProt's "Link"-header pagination is not followed here — confirm
        # whether larger proteomes need paging.
        for entry in entries:
            logger.debug(f"Got entry: {entry}")
            yield entry["primaryAccession"]

    def objects_by_ids(self, object_ids: List[str]) -> List[Dict]:
        """
        Fetch full UniProtKB entries for the given accessions.

        :param object_ids: accessions, optionally prefixed
            ("uniprot:P12345" or "uniprotkb:P12345")
        :raises ValueError: if an ID carries an unrecognized prefix
        """
        session = self.session
        objs = []
        for object_id in object_ids:
            if ":" in object_id:
                # BUG FIX: original was `object_id.split(":")[1]`, which
                # selects only the post-colon segment (a single string) and
                # then fails to unpack it into (pfx, object_id).
                pfx, object_id = object_id.split(":", 1)
                if pfx.lower() not in ["uniprot", "uniprotkb"]:
                    raise ValueError(f"Invalid object id prefix: {pfx}")
            url = f"{BASE_URL}/{object_id}.json"
            logger.info(f"Getting protein data for {object_id} from {url}")
            response = session.get(url)
            response.raise_for_status()
            data = response.json()
            objs.append(data)
        return objs
|
||
|
||
|
||
|
||
|
||
|
Empty file.