Skip to content

Commit

Permalink
Merge branch 'master' into bedrock-integration
Browse files Browse the repository at this point in the history
  • Loading branch information
J2-D2-3PO authored Jan 9, 2025
2 parents bf7214a + 93fd64f commit 7cd5386
Show file tree
Hide file tree
Showing 20 changed files with 843 additions and 464 deletions.
28 changes: 28 additions & 0 deletions docs/docs/guides/core-types/env-vars.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Environment variables

Weave provides a set of environment variables to configure and optimize its behavior. You can set these variables in your shell or within scripts to control specific functionality.

```bash
# Example of setting environment variables in the shell
WEAVE_PARALLELISM=10 # Controls the number of parallel workers
WEAVE_PRINT_CALL_LINK=false # Disables call link output
```

```python
# Example of setting environment variables in Python
import os

os.environ["WEAVE_PARALLELISM"] = "10"
os.environ["WEAVE_PRINT_CALL_LINK"] = "false"
```

## Environment variables reference

| Variable Name | Description |
|--------------------------|-----------------------------------------------------------------|
| WEAVE_CAPTURE_CODE       | If set to `false`, disables code capture for `weave.op`. |
| WEAVE_DEBUG_HTTP | If set to `1`, turns on HTTP request and response logging for debugging. |
| WEAVE_DISABLED | If set to `true`, all tracing to Weave is disabled. |
| WEAVE_PARALLELISM | In evaluations, the number of examples to evaluate in parallel. `1` runs examples sequentially. Default value is `20`. |
| WEAVE_PRINT_CALL_LINK    | If set to `false`, call URL printing is suppressed. Default value is `true`. |
| WEAVE_TRACE_LANGCHAIN    | If set to `false`, explicitly disables global tracing for LangChain. |
1 change: 1 addition & 0 deletions docs/sidebars.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ const sidebars: SidebarsConfig = {
"guides/tools/comparison",
"guides/tools/playground",
"guides/core-types/media",
"guides/core-types/env-vars",
{
type: "category",
collapsible: true,
Expand Down
211 changes: 211 additions & 0 deletions tests/trace/test_call_apply_scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
from __future__ import annotations

import pytest

import weave
from weave.scorers.base_scorer import ApplyScorerResult
from weave.trace.op import OpCallError
from weave.trace.refs import CallRef
from weave.trace.weave_client import Call, Op, WeaveClient


def do_assertions_for_scorer_op(
    apply_score_res: ApplyScorerResult,
    call: Call,
    score_fn: Op | weave.Scorer,
    client: WeaveClient,
):
    """Shared checks for a scorer applied to ``call``.

    Verifies the score result is 0, that exactly one feedback record was
    attached to the call, and that the record references the right
    runnable and score call.
    """
    assert apply_score_res.score_call.id is not None
    assert apply_score_res.result == 0

    all_feedback = list(call.feedback)
    assert len(all_feedback) == 1
    fb = all_feedback[0]

    # Ops carry their own name; Scorer objects are identified by class name.
    if isinstance(score_fn, Op):
        expected_name = score_fn.name
    else:
        expected_name = score_fn.__class__.__name__

    assert fb.feedback_type == f"wandb.runnable.{expected_name}"
    assert fb.runnable_ref == score_fn.ref.uri()

    expected_call_ref = CallRef(
        entity=client.entity,
        project=client.project,
        id=apply_score_res.score_call.id,
    ).uri()
    assert fb.call_ref == expected_call_ref
    assert fb.payload == {"output": apply_score_res.result}


@pytest.mark.asyncio
async def test_scorer_op_no_context(client: WeaveClient):
    """A plain scorer op can be applied to a finished call."""

    @weave.op
    def predict(x):
        return x + 1

    @weave.op
    def score_fn(x, output):
        return output - x - 1

    @weave.op
    def score_fn_with_incorrect_args(y, output):
        return output - y

    _, call = predict.call(1)
    scored = await call.apply_scorer(score_fn)
    do_assertions_for_scorer_op(scored, call, score_fn, client)

    # A scorer whose params don't line up with the call's inputs must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(score_fn_with_incorrect_args)


@pytest.mark.asyncio
async def test_scorer_op_with_context(client: WeaveClient):
    """Extra scorer inputs can be supplied via ``additional_scorer_kwargs``."""

    @weave.op
    def predict(x):
        return x + 1

    @weave.op
    def score_fn(x, output, correct_answer):
        return output - correct_answer

    @weave.op
    def score_fn_with_incorrect_args(x, output, incorrect_arg):
        return output - incorrect_arg

    _, call = predict.call(1)
    scored = await call.apply_scorer(
        score_fn, additional_scorer_kwargs={"correct_answer": 2}
    )
    do_assertions_for_scorer_op(scored, call, score_fn, client)

    # Kwargs that don't satisfy the scorer's signature must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(
            score_fn_with_incorrect_args,
            additional_scorer_kwargs={"correct_answer": 2},
        )


@pytest.mark.asyncio
async def test_async_scorer_op(client: WeaveClient):
    """Async scorer ops are awaited and recorded like sync ones."""

    @weave.op
    def predict(x):
        return x + 1

    @weave.op
    async def score_fn(x, output):
        return output - x - 1

    @weave.op
    async def score_fn_with_incorrect_args(y, output):
        return output - y

    _, call = predict.call(1)
    scored = await call.apply_scorer(score_fn)
    do_assertions_for_scorer_op(scored, call, score_fn, client)

    with pytest.raises(OpCallError):
        await call.apply_scorer(score_fn_with_incorrect_args)


@pytest.mark.asyncio
async def test_scorer_obj_no_context(client: WeaveClient):
    """A ``weave.Scorer`` subclass can be applied to a finished call."""

    @weave.op
    def predict(x):
        return x + 1

    class MyScorer(weave.Scorer):
        offset: int

        @weave.op
        def score(self, x, output):
            return output - x - self.offset

    class MyScorerWithIncorrectArgs(weave.Scorer):
        offset: int

        @weave.op
        def score(self, y, output):
            return output - y - self.offset

    my_scorer = MyScorer(offset=1)

    _, call = predict.call(1)
    scored = await call.apply_scorer(my_scorer)
    do_assertions_for_scorer_op(scored, call, my_scorer, client)

    # A score method whose params don't match the call inputs must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(MyScorerWithIncorrectArgs(offset=1))


@pytest.mark.asyncio
async def test_scorer_obj_with_context(client: WeaveClient):
    """Scorer objects accept extra kwargs and honor ``column_map``."""

    @weave.op
    def predict(x):
        return x + 1

    class MyScorer(weave.Scorer):
        offset: int

        @weave.op
        def score(self, x, output, correct_answer):
            return output - correct_answer - self.offset

    my_scorer = MyScorer(offset=0)

    _, call = predict.call(1)
    scored = await call.apply_scorer(
        my_scorer, additional_scorer_kwargs={"correct_answer": 2}
    )
    do_assertions_for_scorer_op(scored, call, my_scorer, client)

    class MyScorerWithIncorrectArgs(weave.Scorer):
        offset: int

        @weave.op
        def score(self, y, output, incorrect_arg):
            return output - incorrect_arg - self.offset

    # Params that match neither the call inputs nor the kwargs must fail.
    with pytest.raises(OpCallError):
        await call.apply_scorer(
            MyScorerWithIncorrectArgs(offset=0),
            additional_scorer_kwargs={"incorrect_arg": 2},
        )

    class MyScorerWithIncorrectArgsButCorrectColumnMapping(weave.Scorer):
        offset: int

        @weave.op
        def score(self, y, output, incorrect_arg):
            return output - incorrect_arg - self.offset

    # column_map remaps mismatched score params onto the call's inputs
    # and the supplied kwargs, so the same shape now succeeds.
    mapped_scorer = MyScorerWithIncorrectArgsButCorrectColumnMapping(
        offset=0, column_map={"y": "x", "incorrect_arg": "correct_answer"}
    )

    _, call = predict.call(1)
    scored = await call.apply_scorer(
        mapped_scorer, additional_scorer_kwargs={"correct_answer": 2}
    )
    do_assertions_for_scorer_op(scored, call, mapped_scorer, client)


@pytest.mark.asyncio
async def test_async_scorer_obj(client: WeaveClient):
    """An async ``score`` method on a Scorer object is awaited correctly.

    Cleans up two copy-paste artifacts from the sync variant: ``score``
    now actually uses the declared ``offset`` field (it previously
    hard-coded ``- 1`` while ``offset=0`` was passed), and the spurious
    ``correct_answer`` kwarg is no longer forwarded — ``score`` does not
    accept one.
    """

    @weave.op
    def predict(x):
        return x + 1

    class MyScorer(weave.Scorer):
        # Subtracted from the score; with predict(x) == x + 1, offset=1
        # yields the 0 result that do_assertions_for_scorer_op expects.
        offset: int

        @weave.op
        async def score(self, x, output):
            return output - x - self.offset

    scorer = MyScorer(offset=1)

    _, call = predict.call(1)
    apply_score_res = await call.apply_scorer(scorer)
    do_assertions_for_scorer_op(apply_score_res, call, scorer, client)
19 changes: 11 additions & 8 deletions tests/trace/test_feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ def test_runnable_feedback(client: WeaveClient) -> None:
}


def populate_feedback(client: WeaveClient) -> None:
async def populate_feedback(client: WeaveClient) -> None:
@weave.op
def my_scorer(x: int, output: int) -> int:
expected = ["a", "b", "c", "d"][x]
Expand All @@ -369,21 +369,22 @@ def my_model(x: int) -> str:
for x in range(4):
_, c = my_model.call(x)
ids.append(c.id)
c._apply_scorer(my_scorer)
await c.apply_scorer(my_scorer)

assert len(list(my_scorer.calls())) == 4
assert len(list(my_model.calls())) == 4

return ids, my_scorer, my_model


def test_sort_by_feedback(client: WeaveClient) -> None:
@pytest.mark.asyncio
async def test_sort_by_feedback(client: WeaveClient) -> None:
if client_is_sqlite(client):
# Not implemented in sqlite - skip
return pytest.skip()

"""Test sorting by feedback."""
ids, my_scorer, my_model = populate_feedback(client)
ids, my_scorer, my_model = await populate_feedback(client)

for fields, asc_ids in [
(
Expand Down Expand Up @@ -441,13 +442,14 @@ def test_sort_by_feedback(client: WeaveClient) -> None:
), f"Sorting by {fields} descending failed, expected {asc_ids[::-1]}, got {found_ids}"


def test_filter_by_feedback(client: WeaveClient) -> None:
@pytest.mark.asyncio
async def test_filter_by_feedback(client: WeaveClient) -> None:
if client_is_sqlite(client):
# Not implemented in sqlite - skip
return pytest.skip()

"""Test filtering by feedback."""
ids, my_scorer, my_model = populate_feedback(client)
ids, my_scorer, my_model = await populate_feedback(client)
for field, value, eq_ids, gt_ids in [
(
"feedback.[wandb.runnable.my_scorer].payload.output.model_output",
Expand Down Expand Up @@ -514,13 +516,14 @@ def __eq__(self, other):
return isinstance(other, datetime.datetime)


def test_filter_and_sort_by_feedback(client: WeaveClient) -> None:
@pytest.mark.asyncio
async def test_filter_and_sort_by_feedback(client: WeaveClient) -> None:
if client_is_sqlite(client):
# Not implemented in sqlite - skip
return pytest.skip()

"""Test filtering by feedback."""
ids, my_scorer, my_model = populate_feedback(client)
ids, my_scorer, my_model = await populate_feedback(client)
calls = client.server.calls_query_stream(
tsi.CallsQueryReq(
project_id=client._project_id(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@ import {TargetBlank} from '../../../../../common/util/links';
import {Alert} from '../../../../Alert';
import {Loading} from '../../../../Loading';
import {Tailwind} from '../../../../Tailwind';
import {RUNNABLE_FEEDBACK_TYPE_PREFIX} from '../pages/CallPage/CallScoresViewer';
import {Empty} from '../pages/common/Empty';
import {useWFHooks} from '../pages/wfReactInterface/context';
import {useGetTraceServerClientContext} from '../pages/wfReactInterface/traceServerClientContext';
import {FeedbackGridInner} from './FeedbackGridInner';
import {HUMAN_ANNOTATION_BASE_TYPE} from './StructuredFeedback/humanAnnotationTypes';
import {RUNNABLE_FEEDBACK_TYPE_PREFIX} from './StructuredFeedback/runnableFeedbackTypes';

const ANNOTATION_PREFIX = `${HUMAN_ANNOTATION_BASE_TYPE}.`;

Expand Down Expand Up @@ -62,7 +62,11 @@ export const FeedbackGrid = ({
);
// only keep the most recent feedback for each (feedback_type, creator)
const combinedFiltered = Object.values(combined).map(
fs => fs.sort((a, b) => b.created_at - a.created_at)[0]
fs =>
fs.sort(
(a, b) =>
new Date(b.created_at).getTime() - new Date(a.created_at).getTime()
)[0]
);
// add the non-annotation feedback to the combined object
combinedFiltered.push(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import {RUNNABLE_FEEDBACK_TYPE_PREFIX} from '../StructuredFeedback/runnableFeedbackTypes';

// Prefix under which runnable (scorer) feedback appears in a call's
// summary fields, e.g. `summary.weave.feedback.<runnable prefix>.<scorer>...`.
export const RUNNABLE_FEEDBACK_IN_SUMMARY_PREFIX =
  'summary.weave.feedback.' + RUNNABLE_FEEDBACK_TYPE_PREFIX;
// Path segment within a feedback record that holds the scorer's output.
export const RUNNABLE_FEEDBACK_OUTPUT_PART = 'payload.output';

// Decomposition of a summary feedback field: the scorer's name and the
// (possibly empty, dot-prefixed) path into the scorer's output payload.
export type ScorerFeedbackTypeParts = {
  scorerName: string;
  scorePath: string;
};

/**
 * Split a summary feedback field of the form
 * `<summary prefix>.<scorerName>.payload.output<scorePath>` into its
 * scorer name and score path. Returns null when the field does not
 * match that shape.
 */
export const parseScorerFeedbackField = (
  inputField: string
): ScorerFeedbackTypeParts | null => {
  const summaryPrefix = RUNNABLE_FEEDBACK_IN_SUMMARY_PREFIX + '.';
  if (!inputField.startsWith(summaryPrefix)) {
    return null;
  }
  const remainder = inputField.slice(summaryPrefix.length);
  const firstDot = remainder.indexOf('.');
  if (firstDot === -1) {
    return null;
  }
  const scorerName = remainder.slice(0, firstDot);
  const prefixedScorePath = remainder.slice(firstDot + 1);
  if (!prefixedScorePath.startsWith(RUNNABLE_FEEDBACK_OUTPUT_PART)) {
    return null;
  }
  // Note: scorePath keeps its leading '.' (or is '' for the bare output).
  const scorePath = prefixedScorePath.slice(
    RUNNABLE_FEEDBACK_OUTPUT_PART.length
  );
  return {scorerName, scorePath};
};

/**
 * Convert a summary feedback field into the backend filter syntax
 * `feedback.[<runnable prefix>.<scorerName>].payload.output<scorePath>`.
 * Fields that are not scorer feedback fields pass through unchanged.
 */
export const convertScorerFeedbackFieldToBackendFilter = (
  field: string
): string => {
  const parts = parseScorerFeedbackField(field);
  if (!parts) {
    return field;
  }
  return (
    `feedback.[${RUNNABLE_FEEDBACK_TYPE_PREFIX}.${parts.scorerName}]` +
    `.${RUNNABLE_FEEDBACK_OUTPUT_PART}${parts.scorePath}`
  );
};
Loading

0 comments on commit 7cd5386

Please sign in to comment.