Skip to content

Commit

Permalink
Merge branch 'master' into bedrock-integration
Browse files Browse the repository at this point in the history
  • Loading branch information
J2-D2-3PO authored Jan 9, 2025
2 parents bf7214a + 93fd64f commit 7cd5386
Show file tree
Hide file tree
Showing 20 changed files with 843 additions and 464 deletions.
28 changes: 28 additions & 0 deletions docs/docs/guides/core-types/env-vars.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Environment variables

Weave provides a set of environment variables to configure and optimize its behavior. You can set these variables in your shell or within scripts to control specific functionality.

```bash
# Example of setting environment variables in the shell
WEAVE_PARALLELISM=10 # Controls the number of parallel workers
WEAVE_PRINT_CALL_LINK=false # Disables call link output
```

```python
# Example of setting environment variables in Python
import os

os.environ["WEAVE_PARALLELISM"] = "10"
os.environ["WEAVE_PRINT_CALL_LINK"] = "false"
```

## Environment variables reference

| Variable Name | Description |
|--------------------------|-----------------------------------------------------------------|
| WEAVE_CAPTURE_CODE       | If set to `false`, disables code capture for `weave.op`. |
| WEAVE_DEBUG_HTTP | If set to `1`, turns on HTTP request and response logging for debugging. |
| WEAVE_DISABLED | If set to `true`, all tracing to Weave is disabled. |
| WEAVE_PARALLELISM | In evaluations, the number of examples to evaluate in parallel. `1` runs examples sequentially. Default value is `20`. |
| WEAVE_PRINT_CALL_LINK    | If set to `false`, call URL printing is suppressed. Default value is `true`. |
| WEAVE_TRACE_LANGCHAIN    | If set to `false`, explicitly disables global tracing for LangChain. |
1 change: 1 addition & 0 deletions docs/sidebars.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ const sidebars: SidebarsConfig = {
"guides/tools/comparison",
"guides/tools/playground",
"guides/core-types/media",
"guides/core-types/env-vars",
{
type: "category",
collapsible: true,
Expand Down
211 changes: 211 additions & 0 deletions tests/trace/test_call_apply_scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
from __future__ import annotations

import pytest

import weave
from weave.scorers.base_scorer import ApplyScorerResult
from weave.trace.op import OpCallError
from weave.trace.refs import CallRef
from weave.trace.weave_client import Call, Op, WeaveClient


def do_assertions_for_scorer_op(
    apply_score_res: ApplyScorerResult,
    call: Call,
    score_fn: Op | weave.Scorer,
    client: WeaveClient,
):
    """Shared checks for a scorer applied to ``call``.

    Verifies the score result is 0, that exactly one feedback record was
    attached to the call, and that the record references the right
    runnable and score call.
    """
    assert apply_score_res.score_call.id is not None
    assert apply_score_res.result == 0

    all_feedback = list(call.feedback)
    assert len(all_feedback) == 1
    fb = all_feedback[0]

    # Ops carry their own name; Scorer objects are identified by class name.
    if isinstance(score_fn, Op):
        expected_name = score_fn.name
    else:
        expected_name = score_fn.__class__.__name__

    assert fb.feedback_type == f"wandb.runnable.{expected_name}"
    assert fb.runnable_ref == score_fn.ref.uri()

    expected_call_ref = CallRef(
        entity=client.entity,
        project=client.project,
        id=apply_score_res.score_call.id,
    ).uri()
    assert fb.call_ref == expected_call_ref
    assert fb.payload == {"output": apply_score_res.result}


@pytest.mark.asyncio
async def test_scorer_op_no_context(client: WeaveClient):
    """A plain scorer op can be applied to a finished call."""

    @weave.op
    def predict(x):
        return x + 1

    @weave.op
    def score_fn(x, output):
        return output - x - 1

    @weave.op
    def score_fn_with_incorrect_args(y, output):
        return output - y

    _, call = predict.call(1)
    scored = await call.apply_scorer(score_fn)
    do_assertions_for_scorer_op(scored, call, score_fn, client)

    # A scorer whose params don't line up with the call's inputs must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(score_fn_with_incorrect_args)


@pytest.mark.asyncio
async def test_scorer_op_with_context(client: WeaveClient):
    """Extra scorer inputs can be supplied via ``additional_scorer_kwargs``."""

    @weave.op
    def predict(x):
        return x + 1

    @weave.op
    def score_fn(x, output, correct_answer):
        return output - correct_answer

    @weave.op
    def score_fn_with_incorrect_args(x, output, incorrect_arg):
        return output - incorrect_arg

    _, call = predict.call(1)
    scored = await call.apply_scorer(
        score_fn, additional_scorer_kwargs={"correct_answer": 2}
    )
    do_assertions_for_scorer_op(scored, call, score_fn, client)

    # Kwargs that don't satisfy the scorer's signature must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(
            score_fn_with_incorrect_args,
            additional_scorer_kwargs={"correct_answer": 2},
        )


@pytest.mark.asyncio
async def test_async_scorer_op(client: WeaveClient):
    """Async scorer ops are awaited and recorded like sync ones."""

    @weave.op
    def predict(x):
        return x + 1

    @weave.op
    async def score_fn(x, output):
        return output - x - 1

    @weave.op
    async def score_fn_with_incorrect_args(y, output):
        return output - y

    _, call = predict.call(1)
    scored = await call.apply_scorer(score_fn)
    do_assertions_for_scorer_op(scored, call, score_fn, client)

    with pytest.raises(OpCallError):
        await call.apply_scorer(score_fn_with_incorrect_args)


@pytest.mark.asyncio
async def test_scorer_obj_no_context(client: WeaveClient):
    """A ``weave.Scorer`` subclass can be applied to a finished call."""

    @weave.op
    def predict(x):
        return x + 1

    class MyScorer(weave.Scorer):
        offset: int

        @weave.op
        def score(self, x, output):
            return output - x - self.offset

    class MyScorerWithIncorrectArgs(weave.Scorer):
        offset: int

        @weave.op
        def score(self, y, output):
            return output - y - self.offset

    my_scorer = MyScorer(offset=1)

    _, call = predict.call(1)
    scored = await call.apply_scorer(my_scorer)
    do_assertions_for_scorer_op(scored, call, my_scorer, client)

    # A score method whose params don't match the call inputs must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(MyScorerWithIncorrectArgs(offset=1))


@pytest.mark.asyncio
async def test_scorer_obj_with_context(client: WeaveClient):
    """Scorer objects accept extra kwargs and honor ``column_map``."""

    @weave.op
    def predict(x):
        return x + 1

    class MyScorer(weave.Scorer):
        offset: int

        @weave.op
        def score(self, x, output, correct_answer):
            return output - correct_answer - self.offset

    my_scorer = MyScorer(offset=0)

    _, call = predict.call(1)
    scored = await call.apply_scorer(
        my_scorer, additional_scorer_kwargs={"correct_answer": 2}
    )
    do_assertions_for_scorer_op(scored, call, my_scorer, client)

    class MyScorerWithIncorrectArgs(weave.Scorer):
        offset: int

        @weave.op
        def score(self, y, output, incorrect_arg):
            return output - incorrect_arg - self.offset

    # Params that match neither the call inputs nor the kwargs must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(
            MyScorerWithIncorrectArgs(offset=0),
            additional_scorer_kwargs={"incorrect_arg": 2},
        )

    class MyScorerWithIncorrectArgsButCorrectColumnMapping(weave.Scorer):
        offset: int

        @weave.op
        def score(self, y, output, incorrect_arg):
            return output - incorrect_arg - self.offset

    # column_map remaps mismatched score params onto the call's inputs
    # and the supplied kwargs, so the same shape now succeeds.
    mapped_scorer = MyScorerWithIncorrectArgsButCorrectColumnMapping(
        offset=0, column_map={"y": "x", "incorrect_arg": "correct_answer"}
    )

    _, call = predict.call(1)
    scored = await call.apply_scorer(
        mapped_scorer, additional_scorer_kwargs={"correct_answer": 2}
    )
    do_assertions_for_scorer_op(scored, call, mapped_scorer, client)


@pytest.mark.asyncio
async def test_async_scorer_obj(client: WeaveClient):
    """An async ``score`` method on a Scorer object is awaited correctly.

    Cleans up two copy-paste artifacts from the sync variant: ``score``
    now actually uses the declared ``offset`` field (it previously
    hard-coded ``- 1`` while ``offset=0`` was passed), and the spurious
    ``correct_answer`` kwarg is no longer forwarded — ``score`` does not
    accept one.
    """

    @weave.op
    def predict(x):
        return x + 1

    class MyScorer(weave.Scorer):
        # Subtracted from the score; with predict(x) == x + 1, offset=1
        # yields the 0 result that do_assertions_for_scorer_op expects.
        offset: int

        @weave.op
        async def score(self, x, output):
            return output - x - self.offset

    scorer = MyScorer(offset=1)

    _, call = predict.call(1)
    apply_score_res = await call.apply_scorer(scorer)
    do_assertions_for_scorer_op(apply_score_res, call, scorer, client)
19 changes: 11 additions & 8 deletions tests/trace/test_feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def test_runnable_feedback(client: WeaveClient) -> None:
}


def populate_feedback(client: WeaveClient) -> None:
async def populate_feedback(client: WeaveClient) -> None:
@weave.op
def my_scorer(x: int, output: int) -> int:
expected = ["a", "b", "c", "d"][x]
Expand All @@ -369,21 +369,22 @@ def my_model(x: int) -> str:
for x in range(4):
_, c = my_model.call(x)
ids.append(c.id)
c._apply_scorer(my_scorer)
await c.apply_scorer(my_scorer)

assert len(list(my_scorer.calls())) == 4
assert len(list(my_model.calls())) == 4

return ids, my_scorer, my_model


def test_sort_by_feedback(client: WeaveClient) -> None:
@pytest.mark.asyncio
async def test_sort_by_feedback(client: WeaveClient) -> None:
if client_is_sqlite(client):
# Not implemented in sqlite - skip
return pytest.skip()

"""Test sorting by feedback."""
ids, my_scorer, my_model = populate_feedback(client)
ids, my_scorer, my_model = await populate_feedback(client)

for fields, asc_ids in [
(
Expand Down Expand Up @@ -441,13 +442,14 @@ def test_sort_by_feedback(client: WeaveClient) -> None:
), f"Sorting by {fields} descending failed, expected {asc_ids[::-1]}, got {found_ids}"


def test_filter_by_feedback(client: WeaveClient) -> None:
@pytest.mark.asyncio
async def test_filter_by_feedback(client: WeaveClient) -> None:
if client_is_sqlite(client):
# Not implemented in sqlite - skip
return pytest.skip()

"""Test filtering by feedback."""
ids, my_scorer, my_model = populate_feedback(client)
ids, my_scorer, my_model = await populate_feedback(client)
for field, value, eq_ids, gt_ids in [
(
"feedback.[wandb.runnable.my_scorer].payload.output.model_output",
Expand Down Expand Up @@ -514,13 +516,14 @@ def __eq__(self, other):
return isinstance(other, datetime.datetime)


def test_filter_and_sort_by_feedback(client: WeaveClient) -> None:
@pytest.mark.asyncio
async def test_filter_and_sort_by_feedback(client: WeaveClient) -> None:
if client_is_sqlite(client):
# Not implemented in sqlite - skip
return pytest.skip()

"""Test filtering by feedback."""
ids, my_scorer, my_model = populate_feedback(client)
ids, my_scorer, my_model = await populate_feedback(client)
calls = client.server.calls_query_stream(
tsi.CallsQueryReq(
project_id=client._project_id(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ import {TargetBlank} from '../../../../../common/util/links';
import {Alert} from '../../../../Alert';
import {Loading} from '../../../../Loading';
import {Tailwind} from '../../../../Tailwind';
import {RUNNABLE_FEEDBACK_TYPE_PREFIX} from '../pages/CallPage/CallScoresViewer';
import {Empty} from '../pages/common/Empty';
import {useWFHooks} from '../pages/wfReactInterface/context';
import {useGetTraceServerClientContext} from '../pages/wfReactInterface/traceServerClientContext';
import {FeedbackGridInner} from './FeedbackGridInner';
import {HUMAN_ANNOTATION_BASE_TYPE} from './StructuredFeedback/humanAnnotationTypes';
import {RUNNABLE_FEEDBACK_TYPE_PREFIX} from './StructuredFeedback/runnableFeedbackTypes';

const ANNOTATION_PREFIX = `${HUMAN_ANNOTATION_BASE_TYPE}.`;

Expand Down Expand Up @@ -62,7 +62,11 @@ export const FeedbackGrid = ({
);
// only keep the most recent feedback for each (feedback_type, creator)
const combinedFiltered = Object.values(combined).map(
fs => fs.sort((a, b) => b.created_at - a.created_at)[0]
fs =>
fs.sort(
(a, b) =>
new Date(b.created_at).getTime() - new Date(a.created_at).getTime()
)[0]
);
// add the non-annotation feedback to the combined object
combinedFiltered.push(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import {RUNNABLE_FEEDBACK_TYPE_PREFIX} from '../StructuredFeedback/runnableFeedbackTypes';

// Prefix under which runnable (scorer) feedback appears in a call's
// summary fields, e.g. `summary.weave.feedback.<runnable prefix>.<scorer>...`.
export const RUNNABLE_FEEDBACK_IN_SUMMARY_PREFIX =
  'summary.weave.feedback.' + RUNNABLE_FEEDBACK_TYPE_PREFIX;
// Path segment within a feedback record that holds the scorer's output.
export const RUNNABLE_FEEDBACK_OUTPUT_PART = 'payload.output';

// Decomposition of a summary feedback field: the scorer's name and the
// (possibly empty, dot-prefixed) path into the scorer's output payload.
export type ScorerFeedbackTypeParts = {
  scorerName: string;
  scorePath: string;
};

/**
 * Split a summary feedback field of the form
 * `<summary prefix>.<scorerName>.payload.output<scorePath>` into its
 * scorer name and score path. Returns null when the field does not
 * match that shape.
 */
export const parseScorerFeedbackField = (
  inputField: string
): ScorerFeedbackTypeParts | null => {
  const summaryPrefix = RUNNABLE_FEEDBACK_IN_SUMMARY_PREFIX + '.';
  if (!inputField.startsWith(summaryPrefix)) {
    return null;
  }
  const remainder = inputField.slice(summaryPrefix.length);
  const firstDot = remainder.indexOf('.');
  if (firstDot === -1) {
    return null;
  }
  const scorerName = remainder.slice(0, firstDot);
  const prefixedScorePath = remainder.slice(firstDot + 1);
  if (!prefixedScorePath.startsWith(RUNNABLE_FEEDBACK_OUTPUT_PART)) {
    return null;
  }
  // Note: scorePath keeps its leading '.' (or is '' for the bare output).
  const scorePath = prefixedScorePath.slice(
    RUNNABLE_FEEDBACK_OUTPUT_PART.length
  );
  return {scorerName, scorePath};
};

/**
 * Convert a summary feedback field into the backend filter syntax
 * `feedback.[<runnable prefix>.<scorerName>].payload.output<scorePath>`.
 * Fields that are not scorer feedback fields pass through unchanged.
 */
export const convertScorerFeedbackFieldToBackendFilter = (
  field: string
): string => {
  const parts = parseScorerFeedbackField(field);
  if (!parts) {
    return field;
  }
  return (
    `feedback.[${RUNNABLE_FEEDBACK_TYPE_PREFIX}.${parts.scorerName}]` +
    `.${RUNNABLE_FEEDBACK_OUTPUT_PART}${parts.scorePath}`
  );
};
Loading

0 comments on commit 7cd5386

Please sign in to comment.