From b19b20025147eeea9023a634a7e6b6e6647c7d89 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 16:41:34 -0400 Subject: [PATCH 1/9] Correct name in pyproject.toml --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a6b97d1..2dbbf92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,8 +75,8 @@ sphinx-click = {version = ">=4.3.0"} myst-parser = {version = ">=0.18.1"} [tool.poetry.scripts] -curategpt = "curate_gpt.cli:main" -gocampr = "curate_gpt.adhoc.gocam_predictor:main" +curategpt = "curategpt.cli:main" +gocampr = "curategpt.adhoc.gocam_predictor:main" [tool.poetry.extras] docs = [ From cb718f3c2b9fd6126ce3bb5eb039093e6726472c Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:02:32 -0400 Subject: [PATCH 2/9] Expand docstrings for bootstrap commands --- src/curategpt/cli.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/curategpt/cli.py b/src/curategpt/cli.py index 6233e03..0a944cd 100644 --- a/src/curategpt/cli.py +++ b/src/curategpt/cli.py @@ -829,7 +829,32 @@ def extract_from_pubmed( @main.group() def bootstrap(): - "Bootstrap schema or data." + """Bootstrap schema or data. + + Starting with a general description or a LinkML schema, + generate an initial version of a knowledge base. + + The config should be a yaml file with the following fields: + kb_name: str + description: str + attributes: str + main_class: str + + For example, this is a valid config: + kb_name: lumber_kb + description: A knowledge base for lumber + attributes: source_tree + main_class: Lumber_Type + + Examples: + + curategpt bootstrap schema -C config.yaml + (This will generate a LinkML schema, based on the provided config.) + + curategpt bootstrap data -s schema.yaml + (This will generate data based on the provided schema. + The output of the previous command can be used as input for this command.) 
+ """ @bootstrap.command(name="schema") @@ -841,7 +866,7 @@ def bootstrap(): help="path to yaml config", ) def bootstrap_schema(config, model): - """Bootstrap a knowledge base.""" + """Bootstrap a knowledge base with LinkML schema.""" extractor = BasicExtractor() if model: extractor.model_name = model @@ -865,7 +890,7 @@ def bootstrap_schema(config, model): help="path to yaml linkml schema", ) def bootstrap_data(config, schema, model): - """Bootstrap a knowledge base.""" + """Bootstrap a knowledge base with initial data.""" extractor = BasicExtractor() if model: extractor.model_name = model From dc750ea9c3962e09684512dbc13b9485f1ed7fc2 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:23:57 -0400 Subject: [PATCH 3/9] Add Bootstrap function to app with schema generation --- src/curategpt/app/app.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/curategpt/app/app.py b/src/curategpt/app/app.py index 925b72a..c7a9e9d 100644 --- a/src/curategpt/app/app.py +++ b/src/curategpt/app/app.py @@ -11,6 +11,7 @@ from curategpt import BasicExtractor from curategpt.agents import MappingAgent +from curategpt.agents.bootstrap_agent import BootstrapAgent, KnowledgeBaseSpecification from curategpt.agents.chat_agent import ChatAgent, ChatResponse from curategpt.agents.dase_agent import DatabaseAugmentedStructuredExtraction from curategpt.agents.dragon_agent import DragonAgent @@ -37,6 +38,7 @@ SEARCH = "Search" CLUSTER_SEARCH = "Cluster Search" MATCH = "Match" +BOOTSTRAP = "Bootstrap" CURATE = "Curate" ADD_TO_CART = "Add to Cart" # EXTRACT = "Extract" @@ -77,6 +79,7 @@ EXTRACT, CITESEEK, MATCH, + BOOTSTRAP, CART, ABOUT, HELP, @@ -664,6 +667,39 @@ def _flat(obj: dict, limit=40) -> dict: st.subheader(f"Reference {ref}", anchor=f"ref-{ref}") st.code(text, language="yaml") +elif option == BOOTSTRAP: + page_state = state.get_page_state(BOOTSTRAP) + st.subheader(f"Generate a schema and data for a new knowledge base.") + + 
extractor = BasicExtractor() + extractor.model_name = model_name + bootstrap_agent = BootstrapAgent(extractor=extractor) + + kb_name = st.text_input("KB Name", help="Name of the knowledge base (e.g. 'My Knowledge Base')") + description = st.text_input( + "Description", + help="Description of the knowledge base (e.g. 'A knowledge base for my research')", + ) + attributes = st.text_input( + "Attributes", + help="Attributes of the knowledge base (e.g. 'flavor, viscosity, color')", + ) + main_class = st.text_input( + "Main Class", + help="Main class of the knowledge base (e.g. 'Ice Cream')", + ) + if st.button("Make Schema"): + st.write(f"Generating schema for *{kb_name}*") + config_dict = { + "kb_name": kb_name, + "description": description, + "attributes": attributes, + "main_class": main_class, + } + config = KnowledgeBaseSpecification(**config_dict) + ao = bootstrap_agent.bootstrap_schema(config) + st.write(ao.model_dump()) + elif option == CART: page_state = state.get_page_state(CART) st.subheader("Your items") From 22d61cae0e29fa02a35f4f464f2bafb3d699074b Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:31:29 -0400 Subject: [PATCH 4/9] Expand bootstrap function in app to generate data --- src/curategpt/app/app.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/curategpt/app/app.py b/src/curategpt/app/app.py index c7a9e9d..1a72f6c 100644 --- a/src/curategpt/app/app.py +++ b/src/curategpt/app/app.py @@ -688,6 +688,12 @@ def _flat(obj: dict, limit=40) -> dict: "Main Class", help="Main class of the knowledge base (e.g. 'Ice Cream')", ) + generate_data = st.checkbox( + "Generate data", + help=""" + If checked, after generating the schema, generate example data. 
+ """, + ) if st.button("Make Schema"): st.write(f"Generating schema for *{kb_name}*") config_dict = { @@ -698,7 +704,13 @@ def _flat(obj: dict, limit=40) -> dict: } config = KnowledgeBaseSpecification(**config_dict) ao = bootstrap_agent.bootstrap_schema(config) - st.write(ao.model_dump()) + schema_dict = ao.model_dump() + st.write(schema_dict) + + if generate_data: + st.write(f"Generating data for *{kb_name}*") + data = bootstrap_agent.bootstrap_data(schema=schema_dict) + st.code(data, language="yaml") elif option == CART: page_state = state.get_page_state(CART) From cbd9a731a7d330fedb47941f9c1ad6e0e6860691 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:40:02 -0400 Subject: [PATCH 5/9] Update bootstrap template to include attribute input --- src/curategpt/conf/prompts/bootstrap-schema.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/curategpt/conf/prompts/bootstrap-schema.j2 b/src/curategpt/conf/prompts/bootstrap-schema.j2 index a66230c..89bdbc4 100644 --- a/src/curategpt/conf/prompts/bootstrap-schema.j2 +++ b/src/curategpt/conf/prompts/bootstrap-schema.j2 @@ -36,7 +36,7 @@ classes: The goal is to have a tree like model with {{ main_class }} as the root of the tree, and then lists of rich objects as the children of the root. Each rich object may need -its own class to describe it +its own class to describe it. Attributes of the main class should include {{ attributes }}. 
## Examples From fe909fa15a690d825bbcbf8f2c2efdbe0664f768 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:44:48 -0400 Subject: [PATCH 6/9] More template tweaks to avoid parsing errors --- src/curategpt/conf/prompts/bootstrap-schema.j2 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/curategpt/conf/prompts/bootstrap-schema.j2 b/src/curategpt/conf/prompts/bootstrap-schema.j2 index 89bdbc4..f1f5a79 100644 --- a/src/curategpt/conf/prompts/bootstrap-schema.j2 +++ b/src/curategpt/conf/prompts/bootstrap-schema.j2 @@ -151,6 +151,8 @@ classes: ## General Tips +- Do not include any markdown formatting such as code fences +- Do not include any commentary preceding the YAML - don't include base types like string etc under `types`, they are imported already - give {{ main_class }} a mixture of simple string metadata and lists of rich objects for associated data that might require provenance or other metadata. - make sure the YAML is complete. Even though I provide placeholder "..." 
in the examples, the YAML you provide must parse From 3e70666dac20d307d93ab36f6f8b3d19fae7d4f9 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:54:12 -0400 Subject: [PATCH 7/9] Linting --- src/curategpt/app/app.py | 2 +- src/curategpt/cli.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/curategpt/app/app.py b/src/curategpt/app/app.py index 1a72f6c..2d873e8 100644 --- a/src/curategpt/app/app.py +++ b/src/curategpt/app/app.py @@ -669,7 +669,7 @@ def _flat(obj: dict, limit=40) -> dict: elif option == BOOTSTRAP: page_state = state.get_page_state(BOOTSTRAP) - st.subheader(f"Generate a schema and data for a new knowledge base.") + st.subheader("Generate a schema and data for a new knowledge base.") extractor = BasicExtractor() extractor.model_name = model_name diff --git a/src/curategpt/cli.py b/src/curategpt/cli.py index 0a944cd..f5ff96c 100644 --- a/src/curategpt/cli.py +++ b/src/curategpt/cli.py @@ -846,11 +846,11 @@ def bootstrap(): attributes: source_tree main_class: Lumber_Type - Examples: - + Examples: + curategpt bootstrap schema -C config.yaml (This will generate a LinkML schema, based on the provided config.) - + curategpt bootstrap data -s schema.yaml (This will generate data based on the provided schema. The output of the previous command can be used as input for this command.) From bbe18a8b5b4bb19bbf39ab5b77451e6154ca52f0 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 17:56:14 -0400 Subject: [PATCH 8/9] Update app help strings --- src/curategpt/app/app.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/curategpt/app/app.py b/src/curategpt/app/app.py index 2d873e8..f26286f 100644 --- a/src/curategpt/app/app.py +++ b/src/curategpt/app/app.py @@ -675,10 +675,10 @@ def _flat(obj: dict, limit=40) -> dict: extractor.model_name = model_name bootstrap_agent = BootstrapAgent(extractor=extractor) - kb_name = st.text_input("KB Name", help="Name of the knowledge base (e.g. 
'My Knowledge Base')") + kb_name = st.text_input("KB Name", help="Name of the knowledge base, without spaces (e.g. 'ice_cream_kb')") description = st.text_input( "Description", - help="Description of the knowledge base (e.g. 'A knowledge base for my research')", + help="Description of the knowledge base (e.g. 'A knowledge base for ice cream')", ) attributes = st.text_input( "Attributes", @@ -686,7 +686,7 @@ def _flat(obj: dict, limit=40) -> dict: ) main_class = st.text_input( "Main Class", - help="Main class of the knowledge base (e.g. 'Ice Cream')", + help="Main class of the knowledge base, without spaces (e.g. 'IceCreamType')", ) generate_data = st.checkbox( "Generate data", From 79c643ff2c20f218b6b59835791778185d66f093 Mon Sep 17 00:00:00 2001 From: caufieldjh Date: Wed, 9 Oct 2024 18:05:52 -0400 Subject: [PATCH 9/9] Another prompt tweak to avoid parsing problems --- src/curategpt/conf/prompts/bootstrap-schema.j2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/curategpt/conf/prompts/bootstrap-schema.j2 b/src/curategpt/conf/prompts/bootstrap-schema.j2 index f1f5a79..f34a688 100644 --- a/src/curategpt/conf/prompts/bootstrap-schema.j2 +++ b/src/curategpt/conf/prompts/bootstrap-schema.j2 @@ -151,8 +151,9 @@ classes: ## General Tips -- Do not include any markdown formatting such as code fences +- Do not include any markdown formatting such as code fences, except as specified above - Do not include any commentary preceding the YAML +- Do not include any "`" characters within the YAML - don't include base types like string etc under `types`, they are imported already - give {{ main_class }} a mixture of simple string metadata and lists of rich objects for associated data that might require provenance or other metadata. - make sure the YAML is complete. Even though I provide placeholder "..." in the examples, the YAML you provide must parse