diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 54c42593db72..c19ec18134dd 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,4 +1,5 @@ * @wandb/weave-team +/docs/ @wandb/docs-team @wandb/weave-team weave-js/src/common @wandb/fe-infra-reviewers weave-js/src/components @wandb/fe-infra-reviewers @wandb/weave-team weave-js/src/assets @wandb/fe-infra-reviewers @wandb/weave-team diff --git a/.github/workflows/notify-wandb-core.yaml b/.github/workflows/notify-wandb-core.yaml index 3a1fa818d343..8b573259690f 100644 --- a/.github/workflows/notify-wandb-core.yaml +++ b/.github/workflows/notify-wandb-core.yaml @@ -6,14 +6,38 @@ name: Notify wandb/core on: push: branches: - - '**' + - "**" workflow_dispatch: +permissions: + packages: write + jobs: + publish-package: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Configure npm for GitHub Packages + run: | + echo "//npm.pkg.github.com/:_authToken=${{ secrets.GITHUB_TOKEN }}" >> weave-js/.npmrc + - name: Publish package + run: | + cd weave-js + yarn install --frozen-lockfile + npm version 0.0.0-${{ github.sha }} --no-git-tag-version + yarn generate + cp package.json README.md .npmrc src/ + cd src + if [ "${{ github.ref }}" = "refs/heads/master" ]; then + npm publish + else + npm publish --tag prerelease + fi check-which-tests-to-run: uses: ./.github/workflows/check-which-tests-to-run.yaml notify-wandb-core: - needs: check-which-tests-to-run + needs: [check-which-tests-to-run, publish-package] runs-on: ubuntu-latest steps: - name: Repository dispatch diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ab0a55fb384c..0b444bd4e1e9 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -83,9 +83,9 @@ jobs: WANDB_ENABLE_TEST_CONTAINER: true LOGGING_ENABLED: true ports: - - '8080:8080' - - '8083:8083' - - '9015:9015' + - "8080:8080" + - "8083:8083" + - "9015:9015" options: >- --health-cmd "wget -q -O /dev/null http://localhost:8080/healthz || exit 1" --health-interval=5s @@ -165,7 +165,10 @@ jobs: - uses: actions/setup-node@v1 if: steps.check_run.outputs.should_lint_and_compile == 'true' with: - node-version: '18.x' + node-version: "18.x" + - name: Configure npm for GitHub Packages + run: | + echo "//npm.pkg.github.com/:_authToken=${{ secrets.GITHUB_TOKEN }}" >> .npmrc - name: Run WeaveJS Lint and Compile if: steps.check_run.outputs.should_lint_and_compile == 'true' run: | @@ -218,36 +221,37 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version-major: ['3'] + python-version-major: ["3"] python-version-minor: [ - '9', - '10', - '11', - '12', - '13', + "9", + "10", + "11", + "12", + "13", # ] nox-shard: [ - 'trace', - 'trace_server', - 'anthropic', - 'cerebras', - 'cohere', - 'dspy', - 'groq', - 'google_ai_studio', - 'instructor', - 'langchain', - 'litellm', - 'llamaindex', - 'mistral0', - 'mistral1', - 'notdiamond', - 'openai', - 'vertexai', - 'scorers_tests', - 'pandas-test', + "trace", + "trace_server", + "anthropic", + "bedrock", + "cerebras", + "cohere", + "dspy", + "groq", + "google_ai_studio", + "instructor", + "langchain", + "litellm", + "llamaindex", + "mistral0", + "mistral1", + "notdiamond", + "openai", + "vertexai", + "scorers_tests", + "pandas-test", ] fail-fast: false services: @@ -261,9 +265,9 @@ jobs: WANDB_ENABLE_TEST_CONTAINER: true LOGGING_ENABLED: true ports: - - '8080:8080' - - '8083:8083' - - '9015:9015' + - "8080:8080" + - "8083:8083" + - "9015:9015" options: >- --health-cmd "wget -q -O /dev/null 
http://localhost:8080/healthz || exit 1" --health-interval=5s @@ -272,13 +276,15 @@ jobs: weave_clickhouse: image: clickhouse/clickhouse-server ports: - - '8123:8123' + - "8123:8123" options: --health-cmd "wget -nv -O- 'http://localhost:8123/ping' || exit 1" --health-interval=5s --health-timeout=3s steps: - name: Checkout uses: actions/checkout@v3 - name: Enable debug logging run: echo "ACTIONS_STEP_DEBUG=true" >> $GITHUB_ENV + - name: Install SQLite dev package + run: sudo apt update && sudo apt install -y libsqlite3-dev - name: Set up Python ${{ matrix.python-version-major }}.${{ matrix.python-version-minor }} uses: actions/setup-python@v5 with: @@ -305,6 +311,7 @@ jobs: WB_SERVER_HOST: http://wandbservice WF_CLICKHOUSE_HOST: weave_clickhouse WEAVE_SERVER_DISABLE_ECOSYSTEM: 1 + WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} MISTRAL_API_KEY: ${{ secrets.MISTRAL_API_KEY }} diff --git a/Makefile b/Makefile index 6c1ed017c0f9..364e61b53b32 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,9 @@ docs: build: uv build +prerelease-dry-run: + uv run ./weave/scripts/prerelease_dry_run.py + prepare-release: docs build synchronize-base-object-schemas: diff --git a/dev_docs/RELEASE.md b/dev_docs/RELEASE.md index 6514f9d6730b..dcbba003e864 100644 --- a/dev_docs/RELEASE.md +++ b/dev_docs/RELEASE.md @@ -4,7 +4,9 @@ This document outlines how to publish a new Weave release to our public [PyPI pa 1. Verify the head of master is ready for release and announce merge freeze to the Weave team while the release is being published (Either ask an admin on the Weave repo to place a freeze on https://www.mergefreeze.com/ or use the mergefreeze Slack app if it is set up or just post in Slack) -2. You should also run through this [sample notebook](https://colab.research.google.com/drive/1DmkLzhFCFC0OoN-ggBDoG1nejGw2jQZy#scrollTo=29hJrcJQA7jZ) remember to install from master. You can also just run the [quickstart](http://wandb.me/weave_colab). +2. Manual Verifications: + - Run `make prerelease-dry-run` to verify that the dry run script works. + - You should also run through this [sample notebook](https://colab.research.google.com/drive/1DmkLzhFCFC0OoN-ggBDoG1nejGw2jQZy#scrollTo=29hJrcJQA7jZ) remember to install from master. You can also just run the [quickstart](http://wandb.me/weave_colab). 3. To prepare a PATCH release, go to GitHub Actions and run the [bump-python-sdk-version](https://github.com/wandb/weave/actions/workflows/bump_version.yaml) workflow on master. This will: diff --git a/docs/docs/guides/cookbooks/prod_dashboard.md b/docs/docs/guides/cookbooks/prod_dashboard.md deleted file mode 100644 index c76a5ef3ad55..000000000000 --- a/docs/docs/guides/cookbooks/prod_dashboard.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -sidebar_position: 2 -hide_table_of_contents: true ---- - -# Integrating with Weave: Case Study - Custom Dashboard for Production Monitoring -When we consider how well Weave can be intergated in existing processes and AI Apps we consider data input and data output as the two fundemantal characteristics: -1. **Data Input:** - * **Framework Agnostic Tracing:** Many different tools, packages, frameworks are used to create LLM apps (LangChain, LangGraph, LlamaIndex, AutoGen, CrewAI). Weave's single `@weave-op()` decorator makes it very easy to integrate with any framework and custom objects (THERE SHOULD BE A COOKBOOK FOR HOW TO INTEGRATE AND HOW TO DEAL WITH CUSTOM OBJECTS INCL. SERIALIZATION). 
For most common frameworks our team already did that making the tracking of apps as easy as initializing Weave before the start of the application. For how feedback can be flexibly integrated into Weave check out the Cookbook Series on Feedback (ADD LINK TO OTHER COOKBOOK HERE). - * **Openning API endpoints (soon):** LLM applications are very diverse (webpage in typescript, python scripts, java backends, on-device apps) but should still be trckable and traceable from anywhere. For most use-cases Weave is already proving native support (python and typescript coming soon), for the rest Weave makes it very easy to log traces or connect with existing tools (ONE AVAILABLE A COOKBOOK SHOULD BE LAUNCHED ONCE THE NEW APIs ARE EXPOSED). - -2. **Data Output**: Weave focuses on a) online monitoring (tracing generations, tracking governance metrics such as cost, latency, tokens) and b) offline evaluations (systematic benchmarking on eval datasets, human feedback loops, LLM judges, side-by-side comparisons). For both parts Weave provides strong visulalization capabiltities through the Weave UI. Sometimes, creating a visual extension based on the specific business or monitoring requirements makes sense - this is what we'll discover in this cookbook (SEEMS LIKE IN THE FUTURE WE'LL HAVE WEAVE BOARDS BACK FOR THAT). - -The introduction, specific use-case that we consider in this cookbook: -* In this cookbook we show how Weave's exposed APIs and functions make it very easy to create a custom dashboard for production monitoring as an extension to the Traces view in Weave. -* We will focus on creating aggregate views of the cost, latency, tokens, and provided user-feedback -* We will focus on providing aggregation functions across models, calls, and projects -* We will take a look at how to add alerts and guardrailes (GOOD FOR OTHER COOKBOOKS) \ No newline at end of file diff --git a/docs/docs/guides/cookbooks/summarization/.gitignore b/docs/docs/guides/cookbooks/summarization/.gitignore deleted file mode 100644 index 8b137891791f..000000000000 --- a/docs/docs/guides/cookbooks/summarization/.gitignore +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/docs/guides/core-types/datasets.md b/docs/docs/guides/core-types/datasets.md index 820178185408..eb8741dcd4fa 100644 --- a/docs/docs/guides/core-types/datasets.md +++ b/docs/docs/guides/core-types/datasets.md @@ -11,9 +11,9 @@ This guide will show you how to: - Download the latest version - Iterate over examples -## Sample code +## Quickstart - + ```python import weave @@ -68,3 +68,56 @@ This guide will show you how to: + +## Alternate constructors + + + + Datasets can also be constructed from common Weave objects like `Call`s, and popular python objects like `pandas.DataFrame`s. + + + This can be useful if you want to create an example from specific examples. + + ```python + @weave.op + def model(task: str) -> str: + return f"Now working on {task}" + + res1, call1 = model.call(task="fetch") + res2, call2 = model.call(task="parse") + + dataset = Dataset.from_calls([call1, call2]) + # Now you can use the dataset to evaluate the model, etc. + ``` + + + + You can also freely convert between `Dataset`s and `pandas.DataFrame`s. 
+ + ```python + import pandas as pd + + df = pd.DataFrame([ + {'id': '0', 'sentence': "He no likes ice cream.", 'correction': "He doesn't like ice cream."}, + {'id': '1', 'sentence': "She goed to the store.", 'correction': "She went to the store."}, + {'id': '2', 'sentence': "They plays video games all day.", 'correction': "They play video games all day."} + ]) + dataset = Dataset.from_pandas(df) + df2 = dataset.to_pandas() + + assert df.equals(df2) + ``` + + + + + + + + +```typescript +This feature is not available in TypeScript yet. Stay tuned! +``` + + + diff --git a/docs/docs/guides/core-types/env-vars.md b/docs/docs/guides/core-types/env-vars.md new file mode 100644 index 000000000000..5a21cebb91ce --- /dev/null +++ b/docs/docs/guides/core-types/env-vars.md @@ -0,0 +1,28 @@ +# Environment variables + +Weave provides a set of environment variables to configure and optimize its behavior. You can set these variables in your shell or within scripts to control specific functionality. + +```bash +# Example of setting environment variables in the shell +WEAVE_PARALLELISM=10 # Controls the number of parallel workers +WEAVE_PRINT_CALL_LINK=false # Disables call link output +``` + +```python +# Example of setting environment variables in Python +import os + +os.environ["WEAVE_PARALLELISM"] = "10" +os.environ["WEAVE_PRINT_CALL_LINK"] = "false" +``` + +## Environment variables reference + +| Variable Name | Description | +|--------------------------|-----------------------------------------------------------------| +| WEAVE_CAPTURE_CODE | Disable code capture for `weave.op` if set to `false`. | +| WEAVE_DEBUG_HTTP | If set to `1`, turns on HTTP request and response logging for debugging. | +| WEAVE_DISABLED | If set to `true`, all tracing to Weave is disabled. | +| WEAVE_PARALLELISM | In evaluations, the number of examples to evaluate in parallel. `1` runs examples sequentially. Default value is `20`. | +| WEAVE_PRINT_CALL_LINK | If set to `false`, call URL printing is suppressed. Default value is `false`. | +| WEAVE_TRACE_LANGCHAIN | When set to `false`, explicitly disable global tracing for LangChain. | | diff --git a/docs/docs/guides/core-types/media.md b/docs/docs/guides/core-types/media.md index f097e3a0674f..0a0cfa6e2b75 100644 --- a/docs/docs/guides/core-types/media.md +++ b/docs/docs/guides/core-types/media.md @@ -9,7 +9,7 @@ Weave supports logging and displaying multiple first class media types. Log imag Logging type: `PIL.Image.Image`. Here is an example of logging an image with the OpenAI DALL-E API: - + ```python @@ -83,7 +83,7 @@ This image will be logged to weave and automatically displayed in the UI. The fo Logging type: `wave.Wave_read`. Here is an example of logging an audio file using openai's speech generation API. - + ```python diff --git a/docs/docs/guides/core-types/models.md b/docs/docs/guides/core-types/models.md index 83c16aa19a2c..1a9c70c4d7c7 100644 --- a/docs/docs/guides/core-types/models.md +++ b/docs/docs/guides/core-types/models.md @@ -3,7 +3,7 @@ import TabItem from '@theme/TabItem'; # Models - + A `Model` is a combination of data (which can include configuration, trained model weights, or other information) and code that defines how the model operates. By structuring your code to be compatible with this API, you benefit from a structured way to version your application so you can more systematically keep track of your experiments. 
diff --git a/docs/docs/guides/evaluation/guardrails_and_monitors.md b/docs/docs/guides/evaluation/guardrails_and_monitors.md
new file mode 100644
index 000000000000..bf526554b2d3
--- /dev/null
+++ b/docs/docs/guides/evaluation/guardrails_and_monitors.md
@@ -0,0 +1,143 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Guardrails and Monitors
+
+![Feedback](./../../../static/img/guardrails_scorers.png)
+
+Weave provides a robust framework for implementing safety controls and monitoring systems in LLM applications through a unified scoring system. This guide explains how to leverage scorers as both guardrails for active intervention and monitors for passive evaluation in production environments.
+
+## Core Concepts
+
+The foundation of Weave's evaluation system is the `Scorer` class. This abstract base class defines a scoring interface through its `score` method, which concrete implementations use to provide specific evaluation metrics. For a comprehensive overview of available metrics and custom scorer implementation, see [Evaluation Metrics](./scorers.md).
+
+Here's a basic example of a custom scorer:
+
+```python
+class MyScorer(Scorer):
+    def score(self, output: str) -> float:
+        """
+        Evaluate the given output and return a score between 0 and 1.
+
+        Args:
+            output: The LLM-generated content to evaluate
+
+        Returns:
+            float: Score indicating quality/safety (0 = fail, 1 = pass)
+        """
+        return 0.8  # Example score
+```
+
+### Applying Scorers
+
+Scorers are applied to operations using the `apply_scorer` method, which returns an `ApplyScorerResult`:
+
+```python
+@dataclass
+class ApplyScorerSuccess:
+    result: Any  # The original operation result
+    score_call: Call  # The scoring operation call object
+```
+
+Basic scorer application:
+
+```python
+# Get both operation result and Call object
+result, call = op.call(user_input)
+
+# Apply scorer and get evaluation results
+evaluation = await call.apply_scorer(scorer)
+```
+
+:::important
+Always use `op.call(user_input)` rather than direct invocation (`op(user_input)`) when working with scorers. This method returns both the operation result and a `Call` object required for scorer application.
+:::
+
+## Guardrails
+
+Guardrails provide active safety mechanisms by evaluating LLM outputs in real-time and intervening based on scorer results. They are essential for preventing inappropriate or harmful content generation in production systems.
+
+### Implementation
+
+```python
+async def process_with_guardrail(user_input: str) -> str:
+    """
+    Process user input with safety guardrails.
+
+    Args:
+        user_input: The user's input to process
+
+    Returns:
+        str: Processed result if guardrail passes, fallback response if it fails
+    """
+    result, call = op.call(user_input)
+    evaluation = await call.apply_scorer(guardrail)
+
+    if evaluation.result < 0.5:
+        return handle_failed_guardrail(result)
+    return result
+```
+
+## Monitors
+
+While guardrails provide active intervention, monitors offer passive evaluation and tracking of LLM operations. They are crucial for long-term quality assurance and system improvement.
+
+### Implementation
+
+```python
+async def monitored_operation(user_input: str, sampling_rate: float = 0.25) -> str:
+    """
+    Execute operation with monitoring.
+ + Args: + user_input: The input to process + sampling_rate: Percentage of operations to monitor (0.0 to 1.0) + + Returns: + str: Operation result + """ + result, call = op.call(user_input) + + # Apply monitoring based on sampling rate + if random.random() < sampling_rate: + await call.apply_scorer(scorer) + + return result +``` + +:::caution Performance Considerations +Scorer evaluations execute synchronously on the same machine as the operation. For high-throughput production environments, consider adjusting sampling rates based on load. Weave will soon support server-side scoring for high-throughput applications. +::: + +## Analysis and Observability + +### Accessing Scorer Results + +All scorer results are automatically logged as Feedback records in Weave, accessible through multiple interfaces: + +1. **UI Dashboard**: Access detailed scoring history in the Call details page +2. **Call Tables**: Filter and analyze scores across operations +3. **Programmatic Access**: Query results through API endpoints + +![Feedback](./../../../static/img/guardrails_scorers.png) + +### Data Access Examples + +#### HTTP API +```python +calls = client.server.calls_query_stream({ + # ... your filters + "include_feedback": True, # Include all scorer results +}) +``` + +#### Python SDK +```python +# Retrieve comprehensive feedback data for a specific call +call = client.get_call(call_id) +feedback_data = call.feedback +``` + +## Next Steps + +- Deep dive into [Evaluation Metrics](./scorers.md) diff --git a/docs/docs/guides/evaluation/scorers.md b/docs/docs/guides/evaluation/scorers.md index 08cf5b1b64c0..e21be9a1c94e 100644 --- a/docs/docs/guides/evaluation/scorers.md +++ b/docs/docs/guides/evaluation/scorers.md @@ -7,7 +7,7 @@ import TabItem from '@theme/TabItem'; In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. They take the AI's output, analyze it, and return a dictionary of results. Scorers can use your input data as reference if needed and can also output extra information, such as explanations or reasonings from the evaluation. - + Scorers are passed to a `weave.Evaluation` object during evaluation. There are two types of Scorers in weave: @@ -26,7 +26,7 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. ### Function-based Scorers - + These are functions decorated with `@weave.op` that return a dictionary. They're great for simple evaluations like: @@ -68,7 +68,7 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. ### Class-based Scorers - + For more advanced evaluations, especially when you need to keep track of additional scorer metadata, try different prompts for your LLM-evaluators, or make multiple function calls, you can use the `Scorer` class. @@ -139,7 +139,7 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. ### Scorer Keyword Arguments - + Scorers can access both the output from your AI system and the input data from the dataset row. @@ -256,7 +256,7 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. ### Final summarization of the scorer - + During evaluation, the scorer will be computed for each row of your dataset. To provide a final score for the evaluation we provide an `auto_summarize` depending on the returning type of the output. - Averages are computed for numerical columns @@ -305,7 +305,7 @@ In Weave, Scorers are used to evaluate AI outputs and return evaluation metrics. 
## Predefined Scorers - + **Installation**
diff --git a/docs/docs/guides/integrations/bedrock.md b/docs/docs/guides/integrations/bedrock.md
new file mode 100644
index 000000000000..0fe0db80c0c7
--- /dev/null
+++ b/docs/docs/guides/integrations/bedrock.md
@@ -0,0 +1,137 @@
+# Amazon Bedrock
+
+Weave automatically tracks and logs LLM calls made via Amazon Bedrock, AWS's fully managed service that offers foundation models from leading AI companies through a unified API.
+
+## Traces
+
+Weave will automatically capture traces for Bedrock API calls. You can use the Bedrock client as usual after initializing Weave and patching the client:
+
+```python
+import weave
+import boto3
+import json
+from weave.integrations.bedrock.bedrock_sdk import patch_client
+
+weave.init("my_bedrock_app")
+
+# Create and patch the Bedrock client
+client = boto3.client("bedrock-runtime")
+patch_client(client)
+
+# Use the client as usual
+response = client.invoke_model(
+    modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",
+    body=json.dumps({
+        "anthropic_version": "bedrock-2023-05-31",
+        "max_tokens": 100,
+        "messages": [
+            {"role": "user", "content": "What is the capital of France?"}
+        ]
+    }),
+    contentType='application/json',
+    accept='application/json'
+)
+response_dict = json.loads(response.get('body').read())
+print(response_dict["content"][0]["text"])
+```
+
+Or using the `converse` API:
+
+```python
+messages = [{"role": "user", "content": [{"text": "What is the capital of France?"}]}]
+
+response = client.converse(
+    modelId="anthropic.claude-3-5-sonnet-20240620-v1:0",
+    system=[{"text": "You are a helpful AI assistant."}],
+    messages=messages,
+    inferenceConfig={"maxTokens": 100},
+)
+print(response["output"]["message"]["content"][0]["text"])
+
+```
+
+## Wrapping with your own ops
+
+You can create reusable operations using the `@weave.op()` decorator. Here's an example showing both the `invoke_model` and `converse` APIs:
+
+```python
+@weave.op
+def call_model_invoke(
+    model_id: str,
+    prompt: str,
+    max_tokens: int = 100,
+    temperature: float = 0.7
+) -> dict:
+    body = json.dumps({
+        "anthropic_version": "bedrock-2023-05-31",
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "messages": [
+            {"role": "user", "content": prompt}
+        ]
+    })
+
+    response = client.invoke_model(
+        modelId=model_id,
+        body=body,
+        contentType='application/json',
+        accept='application/json'
+    )
+    return json.loads(response.get('body').read())
+
+@weave.op
+def call_model_converse(
+    model_id: str,
+    messages: list,
+    system_message: str,
+    max_tokens: int = 100,
+) -> dict:
+    response = client.converse(
+        modelId=model_id,
+        system=[{"text": system_message}],
+        messages=messages,
+        inferenceConfig={"maxTokens": max_tokens},
+    )
+    return response
+```
+
+![](./imgs/bedrock_converse.png)
+
+## Create a `Model` for easier experimentation
+
+You can create a Weave Model to better organize your experiments and capture parameters. Here's an example using the `converse` API:
+
+```python
+class BedrockLLM(weave.Model):
+    model_id: str
+    max_tokens: int = 100
+    system_message: str = "You are a helpful AI assistant."
+ + @weave.op + def predict(self, prompt: str) -> str: + "Generate a response using Bedrock's converse API" + + messages = [{ + "role": "user", + "content": [{"text": prompt}] + }] + + response = client.converse( + modelId=self.model_id, + system=[{"text": self.system_message}], + messages=messages, + inferenceConfig={"maxTokens": self.max_tokens}, + ) + return response["output"]["message"]["content"][0]["text"] + +# Create and use the model +model = BedrockLLM( + model_id="anthropic.claude-3-5-sonnet-20240620-v1:0", + max_tokens=100, + system_message="You are an expert software engineer that knows a lot of programming. You prefer short answers." +) +result = model.predict("What is the best way to handle errors in Python?") +print(result) +``` + +This approach allows you to version your experiments and easily track different configurations of your Bedrock-based application. diff --git a/docs/docs/guides/integrations/cohere.md b/docs/docs/guides/integrations/cohere.md index 9ff98bfc7fec..15454e0ec02a 100644 --- a/docs/docs/guides/integrations/cohere.md +++ b/docs/docs/guides/integrations/cohere.md @@ -47,10 +47,10 @@ Weave ops make results *reproducible* by automatically versioning code as you ex import cohere import os import weave -from weave.integrations.cohere import cohere_patcher +from weave.integrations.cohere import get_cohere_patcher # we need to patch before we create the client -cohere_patcher.attempt_patch() +get_cohere_patcher().attempt_patch() co = cohere.Client(api_key=os.environ["COHERE_API_KEY"]) diff --git a/docs/docs/guides/integrations/google-gemini.md b/docs/docs/guides/integrations/google-gemini.md index 6afc2790b3d6..208db3634805 100644 --- a/docs/docs/guides/integrations/google-gemini.md +++ b/docs/docs/guides/integrations/google-gemini.md @@ -2,7 +2,7 @@ Google offers two ways of calling Gemini via API: -1. Via the [Vertex APIs](https://cloud.google.com/vertexai/docs). +1. Via the [Vertex APIs](https://cloud.google.com/vertex-ai/docs). 2. Via the [Gemini API SDK](https://ai.google.dev/gemini-api/docs/quickstart?lang=python). ## Tracing diff --git a/docs/docs/guides/integrations/imgs/bedrock_converse.png b/docs/docs/guides/integrations/imgs/bedrock_converse.png new file mode 100644 index 000000000000..1e1fb281baf0 Binary files /dev/null and b/docs/docs/guides/integrations/imgs/bedrock_converse.png differ diff --git a/docs/docs/guides/integrations/nvidia_nim.md b/docs/docs/guides/integrations/nvidia_nim.md index 01007ee7e6a6..8aa26f4f3f35 100644 --- a/docs/docs/guides/integrations/nvidia_nim.md +++ b/docs/docs/guides/integrations/nvidia_nim.md @@ -9,7 +9,7 @@ Weave automatically tracks and logs LLM calls made via the [ChatNVIDIA](https:// It’s important to store traces of LLM applications in a central database, both during development and in production. You’ll use these traces for debugging and to help build a dataset of tricky examples to evaluate against while improving your application. - + Weave can automatically capture traces for the [ChatNVIDIA python library](https://python.langchain.com/docs/integrations/chat/nvidia_ai_endpoints/). @@ -43,7 +43,7 @@ It’s important to store traces of LLM applications in a central database, both ## Track your own ops - + Wrapping a function with `@weave.op` starts capturing inputs, outputs and app logic so you can debug how data flows through your app. You can deeply nest ops and build a tree of functions that you want to track. 
This also starts automatically versioning code as you experiment to capture ad-hoc details that haven't been committed to git. @@ -119,7 +119,7 @@ Navigate to Weave and you can click `get_pokemon_data` in the UI to see the inpu ## Create a `Model` for easier experimentation - + Organizing experimentation is difficult when there are many moving pieces. By using the [`Model`](/guides/core-types/models) class, you can capture and organize the experimental details of your app like your system prompt or the model you're using. This helps organize and compare different iterations of your app. diff --git a/docs/docs/guides/integrations/openai.md b/docs/docs/guides/integrations/openai.md index 541732f5060f..b36951bc27b8 100644 --- a/docs/docs/guides/integrations/openai.md +++ b/docs/docs/guides/integrations/openai.md @@ -7,7 +7,7 @@ import TabItem from '@theme/TabItem'; It’s important to store traces of LLM applications in a central database, both during development and in production. You’ll use these traces for debugging and to help build a dataset of tricky examples to evaluate against while improving your application. - + Weave can automatically capture traces for the [openai python library](https://platform.openai.com/docs/libraries/python-library). @@ -79,7 +79,7 @@ It’s important to store traces of LLM applications in a central database, both ## Track your own ops - + Wrapping a function with `@weave.op` starts capturing inputs, outputs and app logic so you can debug how data flows through your app. You can deeply nest ops and build a tree of functions that you want to track. This also starts automatically versioning code as you experiment to capture ad-hoc details that haven't been committed to git. @@ -249,7 +249,7 @@ Wrapping a function with `weave.op` starts capturing inputs, outputs and app log ## Create a `Model` for easier experimentation - + Organizing experimentation is difficult when there are many moving pieces. By using the [`Model`](/guides/core-types/models) class, you can capture and organize the experimental details of your app like your system prompt or the model you're using. This helps organize and compare different iterations of your app. diff --git a/docs/docs/guides/tools/playground.md b/docs/docs/guides/tools/playground.md index d6e92e43fa93..1382b3a2fe07 100644 --- a/docs/docs/guides/tools/playground.md +++ b/docs/docs/guides/tools/playground.md @@ -24,6 +24,7 @@ Get started with the Playground to optimize your LLM interactions and streamline - [Retry, edit, and delete messages](#retry-edit-and-delete-messages) - [Add a new message](#add-a-new-message) - [Compare LLMs](#compare-llms) +- [Adjust the number of trials](#adjust-the-number-of-trials) ## Prerequisites @@ -222,3 +223,10 @@ Playground allows you to compare LLMs. To perform a comparison, do the following - [Adjust parameters](#adjust-llm-parameters) - [Add functions](#add-a-function) 3. In the message box, enter a message that you want to test with both models and press **Send**. + +## Adjust the number of trials + +Playground allows you to generate multiple outputs for the same input by setting the number of trials. The default setting is `1`. To adjust the number of trials, do the following: + +1. In the Playground UI, open the settings sidebar if it is not already open. +2. Adjust the **Number of trials**. 
\ No newline at end of file diff --git a/docs/docs/guides/tracking/costs.md b/docs/docs/guides/tracking/costs.md index cf130eabea2a..e94c89a40daf 100644 --- a/docs/docs/guides/tracking/costs.md +++ b/docs/docs/guides/tracking/costs.md @@ -5,7 +5,7 @@ import TabItem from '@theme/TabItem'; ## Adding a custom cost - + You can add a custom cost by using the [`add_cost`](/reference/python-sdk/weave/trace/weave.trace.weave_client#method-add_cost) method. The three required fields are `llm_id`, `prompt_token_cost`, and `completion_token_cost`. @@ -45,7 +45,7 @@ import TabItem from '@theme/TabItem'; ## Querying for costs - + You can query for costs by using the [`query_costs`](/reference/python-sdk/weave/trace/weave.trace.weave_client#method-query_costs) method. There are a few ways to query for costs, you can pass in a singular cost id, or a list of LLM model names. @@ -72,7 +72,7 @@ import TabItem from '@theme/TabItem'; ## Purging a custom cost - + You can purge a custom cost by using the [`purge_costs`](/reference/python-sdk/weave/trace/weave.trace.weave_client#method-purge_costs) method. You pass in a list of cost ids, and the costs with those ids are purged. @@ -95,7 +95,7 @@ import TabItem from '@theme/TabItem'; ## Calculating costs for a Project - + You can calculate costs for a project by using our `calls_query` and adding `include_costs=True` with a little bit of setup. diff --git a/docs/docs/guides/tracking/feedback.md b/docs/docs/guides/tracking/feedback.md index 68e27776d444..d3e416e24443 100644 --- a/docs/docs/guides/tracking/feedback.md +++ b/docs/docs/guides/tracking/feedback.md @@ -75,7 +75,7 @@ You can also get additional information for each feedback object in `client.get_ - `feedback_type`: The type of feedback (reaction, note, custom). - `payload`: The feedback payload - + ```python import weave @@ -115,7 +115,7 @@ You can add feedback to a call using the call's UUID. To use the UUID to get a p - `call.feedback.add_note("")`: Add a note. - `call.feedback.add("