Commit: Merge branch 'master' into object-viewer-updates
Showing 153 changed files with 8,410 additions and 2,560 deletions.
@@ -1,4 +1,5 @@
* @wandb/weave-team
/docs/ @wandb/docs-team @wandb/weave-team
weave-js/src/common @wandb/fe-infra-reviewers
weave-js/src/components @wandb/fe-infra-reviewers @wandb/weave-team
weave-js/src/assets @wandb/fe-infra-reviewers @wandb/weave-team
@@ -0,0 +1,143 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Guardrails and Monitors

![Feedback](./../../../static/img/guardrails_scorers.png)

Weave provides a robust framework for implementing safety controls and monitoring systems in LLM applications through a unified scoring system. This guide explains how to leverage scorers as both guardrails for active intervention and monitors for passive evaluation in production environments.

## Core Concepts

The foundation of Weave's evaluation system is the `Scorer` class. This abstract base class defines a scoring interface through its `score` method, which concrete implementations use to provide specific evaluation metrics. For a comprehensive overview of available metrics and custom scorer implementation, see [Evaluation Metrics](./scorers.md).

Here's a basic example of a custom scorer:
```python
from weave import Scorer


class MyScorer(Scorer):
    def score(self, output: str) -> float:
        """
        Evaluate the given output and return a score between 0 and 1.

        Args:
            output: The LLM-generated content to evaluate

        Returns:
            float: Score indicating quality/safety (0 = fail, 1 = pass)
        """
        return 0.8  # Example score
```

### Applying Scorers

Scorers are applied to operations using the `apply_scorer` method, which returns an `ApplyScorerResult`:

```python
@dataclass
class ApplyScorerSuccess:
    result: Any  # The original operation result
    score_call: Call  # The scoring operation call object
```

Basic scorer application:

```python
# Get both operation result and Call object
result, call = op.call(user_input)

# Apply scorer and get evaluation results
evaluation = await call.apply_scorer(scorer)
```

:::important
Always use `op.call(user_input)` rather than direct invocation (`op(user_input)`) when working with scorers. This method returns both the operation result and a `Call` object required for scorer application.
:::
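
As a minimal sketch of the difference, assuming a hypothetical `generate_text` op and project name:

```python
import weave

weave.init("my-project")  # hypothetical project name

@weave.op
def generate_text(user_input: str) -> str:
    # Stand-in for a real LLM call
    return f"Response to: {user_input}"

# Direct invocation returns only the output:
output = generate_text("Hello")

# .call() returns the output *and* the Call object needed for apply_scorer:
output, call = generate_text.call("Hello")
```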

## Guardrails

Guardrails provide active safety mechanisms by evaluating LLM outputs in real-time and intervening based on scorer results. They are essential for preventing inappropriate or harmful content generation in production systems.

### Implementation

```python
async def process_with_guardrail(user_input: str) -> str:
    """
    Process user input with safety guardrails.

    Args:
        user_input: The user's input to process

    Returns:
        str: Processed result if guardrail passes, fallback response if it fails
    """
    result, call = op.call(user_input)
    evaluation = await call.apply_scorer(guardrail)

    if evaluation.score < 0.5:
        return handle_failed_guardrail(result)
    return result
```
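
A possible way to invoke this from a script, with a hypothetical `handle_failed_guardrail` fallback (the snippet above leaves it undefined):

```python
import asyncio

def handle_failed_guardrail(result: str) -> str:
    # Hypothetical fallback: discard the flagged output and return a safe message
    return "Sorry, I can't help with that request."

async def main() -> None:
    response = await process_with_guardrail("Tell me about the weather")
    print(response)

asyncio.run(main())
```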

## Monitors

While guardrails provide active intervention, monitors offer passive evaluation and tracking of LLM operations. They are crucial for long-term quality assurance and system improvement.

### Implementation

```python
import random

async def monitored_operation(user_input: str, sampling_rate: float = 0.25) -> str:
    """
    Execute an operation with monitoring.

    Args:
        user_input: The input to process
        sampling_rate: Fraction of operations to monitor (0.0 to 1.0)

    Returns:
        str: Operation result
    """
    result, call = op.call(user_input)

    # Apply monitoring based on the sampling rate
    if random.random() < sampling_rate:
        await call.apply_scorer(scorer)

    return result
```

:::caution Performance Considerations
Scorer evaluations execute synchronously on the same machine as the operation. For high-throughput production environments, consider adjusting sampling rates based on load. Weave will soon support server-side scoring for high-throughput applications.
:::

## Analysis and Observability

### Accessing Scorer Results

All scorer results are automatically logged as Feedback records in Weave, accessible through multiple interfaces:

1. **UI Dashboard**: Access detailed scoring history in the Call details page
2. **Call Tables**: Filter and analyze scores across operations
3. **Programmatic Access**: Query results through API endpoints

![Feedback](./../../../static/img/guardrails_scorers.png)

### Data Access Examples

#### HTTP API

```python
calls = client.server.calls_query_stream({
    # ... your filters
    "include_feedback": True,  # Include all scorer results
})
```

#### Python SDK

```python
# Retrieve comprehensive feedback data for a specific call
call = client.get_call(call_id)
feedback_data = call.feedback
```
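
The snippet above assumes a `client` and a `call_id` are already in hand. A minimal sketch of where these typically come from, reusing the hypothetical `generate_text` op and project name from earlier:

```python
import weave

client = weave.init("my-project")  # hypothetical project name

# One way to obtain a call_id: capture the Call object when invoking an op
output, call = generate_text.call("Hello")
call_id = call.id

fetched_call = client.get_call(call_id)
print(fetched_call.feedback)
```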

## Next Steps

- Deep dive into [Evaluation Metrics](./scorers.md)