Commit

Merge pull request #981 from vespa-engine/thomasht86/fix-integrationtest-vectorsearch

(ci) fix integrationtest vectorsearch
thomasht86 authored Nov 22, 2024
2 parents 6bf4808 + 40995d3 commit daa1679
Showing 2 changed files with 57 additions and 53 deletions.
30 changes: 18 additions & 12 deletions .github/workflows/integration-cloud.yml
@@ -5,6 +5,8 @@ on:
push:
branches:
- master
pull_request:
paths: [".github/workflows/integration-cloud.yml"]
schedule:
- cron: "0 11 * * 0"

@@ -17,16 +19,18 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
id: setup-python
with:
python-version: "3.9"
python-version: "3.10"
cache: "pip"
cache-dependency-path: |
pyproject.toml
- run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key
- name: Install dependencies
run: |
pip install -e .[dev]
- name: Run integration tests
env:
VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }}
@@ -35,42 +39,44 @@ jobs:
integration-cloud-token:
runs-on: ubuntu-latest
needs: integration-cloud
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
id: setup-python
with:
python-version: "3.9"
python-version: "3.10"
cache: "pip"
cache-dependency-path: |
pyproject.toml
- run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key
- name: Install dependencies
run: |
pip install -e .[dev]
- name: Run integration tests
env:
VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }}
VESPA_CLOUD_SECRET_TOKEN: ${{ secrets.VESPA_CLOUD_SECRET_TOKEN }}
VESPA_CLIENT_TOKEN_ID: ${{ secrets.VESPA_CLIENT_TOKEN_ID}}
VESPA_CLIENT_TOKEN_ID: ${{ secrets.VESPA_CLIENT_TOKEN_ID }}
run: |
pytest tests/integration/test_integration_vespa_cloud_token.py -s -v
integration-cloud-vector-search:
runs-on: ubuntu-latest
needs: integration-cloud-token
steps:
- uses: actions/checkout@v4

- name: Set up Python
id: setup-python
uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.10"
cache: "pip"
cache-dependency-path: |
pyproject.toml
- run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key
- name: Install dependencies
run: |
pip install -e .[dev]
- name: Run integration tests
env:
VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }}
80 changes: 39 additions & 41 deletions tests/integration/test_integration_vespa_cloud_vector_search.py
@@ -21,7 +21,6 @@
ContainerCluster,
Nodes,
DeploymentConfiguration,
EmptyDeploymentConfiguration,
Validation,
ValidationID,
)
@@ -93,7 +92,7 @@ def test_vector_indexing_and_query(self):

from datasets import load_dataset

sample_size = 1000
sample_size = 100
# streaming=True pages the data from S3. This is needed to avoid memory issues when loading the dataset.
dataset = load_dataset(
"KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
@@ -109,6 +108,10 @@ def test_vector_indexing_and_query(self):
docs = list(
pyvespa_feed_format
) # we have enough memory to page everything into memory with list()
# seems like we sometimes can get more than sample_size docs
if len(docs) > sample_size:
docs = docs[:sample_size]
self.assertEqual(len(docs), sample_size)
ok = 0
callbacks = 0
start_time = time.time()
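The truncation added in the hunk above guards against the streaming dataset occasionally yielding more rows than requested by `.take()`. A minimal sketch of that pattern; the `embedding` field name is a placeholder, since the real pyvespa feed format is defined earlier in the test and not shown in this diff:

```python
from datasets import load_dataset

sample_size = 100
dataset = load_dataset(
    "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
).take(sample_size)

# Map each row into pyvespa's feed format; "embedding" is a placeholder field name.
feed = dataset.map(
    lambda x: {"id": x["_id"], "fields": {"id": x["_id"], "embedding": x["openai"]}}
)

docs = list(feed)  # small enough to materialize in memory
# Streaming .take() can occasionally return more than sample_size rows, so truncate.
docs = docs[:sample_size]
assert len(docs) == sample_size
```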
@@ -127,9 +130,6 @@ def callback(response: VespaResponse, id: str):
schema="vector",
namespace="benchmark",
callback=callback,
max_workers=48,
max_connections=48,
max_queue_size=4000,
)
self.assertEqual(ok, sample_size)
duration = time.time() - start
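For reference, a hedged sketch of the feed-with-callback pattern used here, now relying on `feed_iterable`'s default concurrency settings instead of the pinned `max_workers`/`max_connections`/`max_queue_size` values that this change removes. It assumes `app` is an already-deployed `Vespa` application:

```python
from vespa.application import Vespa
from vespa.io import VespaResponse


def feed_and_count(app: Vespa, docs) -> tuple[int, int]:
    """Feed docs and count successful operations vs. total callbacks."""
    ok = 0
    callbacks = 0

    def callback(response: VespaResponse, id: str):
        nonlocal ok, callbacks
        if response.is_successful():
            ok += 1
        callbacks += 1

    # Concurrency knobs are left at their defaults, as in the updated test.
    app.feed_iterable(
        iter=docs,
        schema="vector",
        namespace="benchmark",
        callback=callback,
    )
    return ok, callbacks
```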
@@ -164,31 +164,28 @@ def callback(response: VespaResponse, id: str):
ok = 0
callbacks = 0
start_time = time.time()
dataset = load_dataset(
"KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
).take(100)

feed_with_wrong_field = dataset.map(
lambda x: {
"id": x["_id"],
"fields": {"id": x["_id"], "vector": x["openai"]},
}
)
faulty_docs = list(feed_with_wrong_field)
if len(faulty_docs) > sample_size:
faulty_docs = faulty_docs[:sample_size]
self.assertEqual(len(faulty_docs), sample_size)
self.app.feed_iterable(
iter=faulty_docs,
schema="vector",
namespace="benchmark",
callback=callback,
max_workers=48,
max_connections=48,
)
self.assertEqual(ok, 0)
self.assertEqual(callbacks, 100)

ok = 0
dataset = load_dataset(
"KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
).take(sample_size)

# Run update - assign all docs with a meta field

updates = dataset.map(lambda x: {"id": x["_id"], "fields": {"meta": "stuff"}})
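The update pass above maps every document to a partial payload that carries only a `meta` field. How it is fed lies outside this diff; a hedged sketch of one way to issue it with pyvespa follows, where the `operation_type="update"` argument is an assumption rather than something shown in the change:

```python
from datasets import load_dataset
from vespa.application import Vespa


def assign_meta(app: Vespa, sample_size: int = 100) -> None:
    """Partially update each document with a static `meta` field."""
    updates = (
        load_dataset(
            "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
        )
        .take(sample_size)
        .map(lambda x: {"id": x["_id"], "fields": {"meta": "stuff"}})
    )
    app.feed_iterable(
        iter=updates,
        schema="vector",
        namespace="benchmark",
        operation_type="update",  # assumption: partial updates rather than full feeds
    )
```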
@@ -239,7 +236,7 @@ def tearDown(self) -> None:


class TestProdDeploymentFromDisk(unittest.TestCase):
def setUp(self) -> None:
def test_setup(self) -> None:
self.app_package = create_vector_ada_application_package()
prod_region = "aws-us-east-1c"
self.app_package.clusters = [
@@ -302,32 +299,33 @@ def test_application_status(self):
def test_vector_indexing_and_query(self):
super().test_vector_indexing_and_query()

@unittest.skip("Do not run when not waiting for deployment.")
def tearDown(self) -> None:
self.app.delete_all_docs(
content_cluster_name="vector_content",
schema="vector",
namespace="benchmark",
)
time.sleep(5)
with self.app.syncio() as sync_session:
response: VespaResponse = sync_session.query(
{"yql": "select id from sources * where true", "hits": 10}
)
self.assertEqual(response.get_status_code(), 200)
self.assertEqual(len(response.hits), 0)
print(response.get_json())
# DO NOT skip tearDown-method, as test will not exit.
# @unittest.skip("Do not run when not waiting for deployment.")
# def tearDown(self) -> None:
# self.app.delete_all_docs(
# content_cluster_name="vector_content",
# schema="vector",
# namespace="benchmark",
# )
# time.sleep(5)
# with self.app.syncio() as sync_session:
# response: VespaResponse = sync_session.query(
# {"yql": "select id from sources * where true", "hits": 10}
# )
# self.assertEqual(response.get_status_code(), 200)
# self.assertEqual(len(response.hits), 0)
# print(response.get_json())

# Deployment is deleted by deploying with an empty deployment.xml file.
self.app_package.deployment_config = EmptyDeploymentConfiguration()
# # Deployment is deleted by deploying with an empty deployment.xml file.
# self.app_package.deployment_config = EmptyDeploymentConfiguration()

# Vespa won't push the deleted deployment.xml file unless we add a validation override
tomorrow = datetime.now() + timedelta(days=1)
formatted_date = tomorrow.strftime("%Y-%m-%d")
self.app_package.validations = [
Validation(ValidationID("deployment-removal"), formatted_date)
]
self.app_package.to_files(self.application_root)
# This will delete the deployment
self.vespa_cloud._start_prod_deployment(self.application_root)
shutil.rmtree(self.application_root, ignore_errors=True)
# # Vespa won't push the deleted deployment.xml file unless we add a validation override
# tomorrow = datetime.now() + timedelta(days=1)
# formatted_date = tomorrow.strftime("%Y-%m-%d")
# self.app_package.validations = [
# Validation(ValidationID("deployment-removal"), formatted_date)
# ]
# self.app_package.to_files(self.application_root)
# # This will delete the deployment
# self.vespa_cloud._start_prod_deployment(self.application_root)
# shutil.rmtree(self.application_root, ignore_errors=True)
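The commented-out teardown above documents how the prod deployment would be removed: deploy an empty deployment.xml together with a `deployment-removal` validation override. A sketch of that flow in runnable form, with import paths assumed to be `vespa.package` (the test's import block is elided in this diff) and the application package, cloud handle, and application root passed in by the caller:

```python
from datetime import datetime, timedelta

# Import paths assumed; these classes appear in the test's (elided) import block.
from vespa.package import (
    ApplicationPackage,
    EmptyDeploymentConfiguration,
    Validation,
    ValidationID,
)


def remove_prod_deployment(app_package: ApplicationPackage, vespa_cloud, application_root: str) -> None:
    """Delete the prod deployment by deploying an empty deployment.xml."""
    app_package.deployment_config = EmptyDeploymentConfiguration()

    # Vespa rejects the removed deployment.xml unless a validation override is added.
    tomorrow = datetime.now() + timedelta(days=1)
    app_package.validations = [
        Validation(ValidationID("deployment-removal"), tomorrow.strftime("%Y-%m-%d"))
    ]

    app_package.to_files(application_root)
    vespa_cloud._start_prod_deployment(application_root)  # this deletes the deployment
```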
