Commit

Merge pull request #981 from vespa-engine/thomasht86/fix-integrationtest-vectorsearch

(ci) fix integrationtest vectorsearch
thomasht86 authored Nov 22, 2024
2 parents 6bf4808 + 40995d3 commit daa1679
Showing 2 changed files with 57 additions and 53 deletions.
30 changes: 18 additions & 12 deletions .github/workflows/integration-cloud.yml
@@ -5,6 +5,8 @@ on:
push:
branches:
- master
pull_request:
paths: [".github/workflows/integration-cloud.yml"]
schedule:
- cron: "0 11 * * 0"

@@ -17,16 +19,18 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
id: setup-python
with:
python-version: "3.9"
python-version: "3.10"
cache: "pip"
cache-dependency-path: |
pyproject.toml
- run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key
- name: Install dependencies
run: |
pip install -e .[dev]
- name: Run integration tests
env:
VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }}
@@ -35,42 +39,44 @@ jobs:
integration-cloud-token:
runs-on: ubuntu-latest
needs: integration-cloud
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
id: setup-python
with:
python-version: "3.9"
python-version: "3.10"
cache: "pip"
cache-dependency-path: |
pyproject.toml
- run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key
- name: Install dependencies
run: |
pip install -e .[dev]
- name: Run integration tests
env:
VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }}
VESPA_CLOUD_SECRET_TOKEN: ${{ secrets.VESPA_CLOUD_SECRET_TOKEN }}
VESPA_CLIENT_TOKEN_ID: ${{ secrets.VESPA_CLIENT_TOKEN_ID}}
VESPA_CLIENT_TOKEN_ID: ${{ secrets.VESPA_CLIENT_TOKEN_ID }}
run: |
pytest tests/integration/test_integration_vespa_cloud_token.py -s -v
integration-cloud-vector-search:
runs-on: ubuntu-latest
needs: integration-cloud-token
steps:
- uses: actions/checkout@v4

- name: Set up Python
id: setup-python
uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.10"
cache: "pip"
cache-dependency-path: |
pyproject.toml
- run: echo '${{ steps.setup-python.outputs.cache-hit }}' # true if cache-hit occurred on the primary key
- name: Install dependencies
run: |
pip install -e .[dev]
- name: Run integration tests
env:
VESPA_TEAM_API_KEY: ${{ secrets.VESPA_TEAM_API_KEY }}
80 changes: 39 additions & 41 deletions tests/integration/test_integration_vespa_cloud_vector_search.py
@@ -21,7 +21,6 @@
ContainerCluster,
Nodes,
DeploymentConfiguration,
EmptyDeploymentConfiguration,
Validation,
ValidationID,
)
@@ -93,7 +92,7 @@ def test_vector_indexing_and_query(self):

from datasets import load_dataset

sample_size = 1000
sample_size = 100
# streaming=True pages the data from S3. This is needed to avoid memory issues when loading the dataset.
dataset = load_dataset(
"KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
@@ -109,6 +108,10 @@ def test_vector_indexing_and_query(self):
docs = list(
pyvespa_feed_format
) # we have enough memory to page everything into memory with list()
# seems like we sometimes can get more than sample_size docs
if len(docs) > sample_size:
docs = docs[:sample_size]
self.assertEqual(len(docs), sample_size)
ok = 0
callbacks = 0
start_time = time.time()
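The truncation added in the hunk above guards against the streaming dataset occasionally yielding more rows than requested by `.take()`. A minimal sketch of that pattern; the `embedding` field name is a placeholder, since the real pyvespa feed format is defined earlier in the test and not shown in this diff:

```python
from datasets import load_dataset

sample_size = 100
dataset = load_dataset(
    "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
).take(sample_size)

# Map each row into pyvespa's feed format; "embedding" is a placeholder field name.
feed = dataset.map(
    lambda x: {"id": x["_id"], "fields": {"id": x["_id"], "embedding": x["openai"]}}
)

docs = list(feed)  # small enough to materialize in memory
# Streaming .take() can occasionally return more than sample_size rows, so truncate.
docs = docs[:sample_size]
assert len(docs) == sample_size
```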
@@ -127,9 +130,6 @@ def callback(response: VespaResponse, id: str):
schema="vector",
namespace="benchmark",
callback=callback,
max_workers=48,
max_connections=48,
max_queue_size=4000,
)
self.assertEqual(ok, sample_size)
duration = time.time() - start
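For reference, a hedged sketch of the feed-with-callback pattern used here, now relying on `feed_iterable`'s default concurrency settings instead of the pinned `max_workers`/`max_connections`/`max_queue_size` values that this change removes. It assumes `app` is an already-deployed `Vespa` application:

```python
from vespa.application import Vespa
from vespa.io import VespaResponse


def feed_and_count(app: Vespa, docs) -> tuple[int, int]:
    """Feed docs and count successful operations vs. total callbacks."""
    ok = 0
    callbacks = 0

    def callback(response: VespaResponse, id: str):
        nonlocal ok, callbacks
        if response.is_successful():
            ok += 1
        callbacks += 1

    # Concurrency knobs are left at their defaults, as in the updated test.
    app.feed_iterable(
        iter=docs,
        schema="vector",
        namespace="benchmark",
        callback=callback,
    )
    return ok, callbacks
```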
@@ -164,31 +164,28 @@ def callback(response: VespaResponse, id: str):
ok = 0
callbacks = 0
start_time = time.time()
dataset = load_dataset(
"KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
).take(100)

feed_with_wrong_field = dataset.map(
lambda x: {
"id": x["_id"],
"fields": {"id": x["_id"], "vector": x["openai"]},
}
)
faulty_docs = list(feed_with_wrong_field)
if len(faulty_docs) > sample_size:
faulty_docs = faulty_docs[:sample_size]
self.assertEqual(len(faulty_docs), sample_size)
self.app.feed_iterable(
iter=faulty_docs,
schema="vector",
namespace="benchmark",
callback=callback,
max_workers=48,
max_connections=48,
)
self.assertEqual(ok, 0)
self.assertEqual(callbacks, 100)

ok = 0
dataset = load_dataset(
"KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
).take(sample_size)

# Run update - assign all docs with a meta field

updates = dataset.map(lambda x: {"id": x["_id"], "fields": {"meta": "stuff"}})
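The update pass above maps every document to a partial payload that carries only a `meta` field. How it is fed lies outside this diff; a hedged sketch of one way to issue it with pyvespa follows, where the `operation_type="update"` argument is an assumption rather than something shown in the change:

```python
from datasets import load_dataset
from vespa.application import Vespa


def assign_meta(app: Vespa, sample_size: int = 100) -> None:
    """Partially update each document with a static `meta` field."""
    updates = (
        load_dataset(
            "KShivendu/dbpedia-entities-openai-1M", split="train", streaming=True
        )
        .take(sample_size)
        .map(lambda x: {"id": x["_id"], "fields": {"meta": "stuff"}})
    )
    app.feed_iterable(
        iter=updates,
        schema="vector",
        namespace="benchmark",
        operation_type="update",  # assumption: partial updates rather than full feeds
    )
```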
@@ -239,7 +236,7 @@ def tearDown(self) -> None:


class TestProdDeploymentFromDisk(unittest.TestCase):
def setUp(self) -> None:
def test_setup(self) -> None:
self.app_package = create_vector_ada_application_package()
prod_region = "aws-us-east-1c"
self.app_package.clusters = [
@@ -302,32 +299,33 @@ def test_application_status(self):
def test_vector_indexing_and_query(self):
super().test_vector_indexing_and_query()

@unittest.skip("Do not run when not waiting for deployment.")
def tearDown(self) -> None:
self.app.delete_all_docs(
content_cluster_name="vector_content",
schema="vector",
namespace="benchmark",
)
time.sleep(5)
with self.app.syncio() as sync_session:
response: VespaResponse = sync_session.query(
{"yql": "select id from sources * where true", "hits": 10}
)
self.assertEqual(response.get_status_code(), 200)
self.assertEqual(len(response.hits), 0)
print(response.get_json())
# DO NOT skip tearDown-method, as test will not exit.
# @unittest.skip("Do not run when not waiting for deployment.")
# def tearDown(self) -> None:
# self.app.delete_all_docs(
# content_cluster_name="vector_content",
# schema="vector",
# namespace="benchmark",
# )
# time.sleep(5)
# with self.app.syncio() as sync_session:
# response: VespaResponse = sync_session.query(
# {"yql": "select id from sources * where true", "hits": 10}
# )
# self.assertEqual(response.get_status_code(), 200)
# self.assertEqual(len(response.hits), 0)
# print(response.get_json())

# Deployment is deleted by deploying with an empty deployment.xml file.
self.app_package.deployment_config = EmptyDeploymentConfiguration()
# # Deployment is deleted by deploying with an empty deployment.xml file.
# self.app_package.deployment_config = EmptyDeploymentConfiguration()

# Vespa won't push the deleted deployment.xml file unless we add a validation override
tomorrow = datetime.now() + timedelta(days=1)
formatted_date = tomorrow.strftime("%Y-%m-%d")
self.app_package.validations = [
Validation(ValidationID("deployment-removal"), formatted_date)
]
self.app_package.to_files(self.application_root)
# This will delete the deployment
self.vespa_cloud._start_prod_deployment(self.application_root)
shutil.rmtree(self.application_root, ignore_errors=True)
# # Vespa won't push the deleted deployment.xml file unless we add a validation override
# tomorrow = datetime.now() + timedelta(days=1)
# formatted_date = tomorrow.strftime("%Y-%m-%d")
# self.app_package.validations = [
# Validation(ValidationID("deployment-removal"), formatted_date)
# ]
# self.app_package.to_files(self.application_root)
# # This will delete the deployment
# self.vespa_cloud._start_prod_deployment(self.application_root)
# shutil.rmtree(self.application_root, ignore_errors=True)
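The commented-out teardown above documents how the prod deployment would be removed: deploy an empty deployment.xml together with a `deployment-removal` validation override. A sketch of that flow in runnable form, with import paths assumed to be `vespa.package` (the test's import block is elided in this diff) and the application package, cloud handle, and application root passed in by the caller:

```python
from datetime import datetime, timedelta

# Import paths assumed; these classes appear in the test's (elided) import block.
from vespa.package import (
    ApplicationPackage,
    EmptyDeploymentConfiguration,
    Validation,
    ValidationID,
)


def remove_prod_deployment(app_package: ApplicationPackage, vespa_cloud, application_root: str) -> None:
    """Delete the prod deployment by deploying an empty deployment.xml."""
    app_package.deployment_config = EmptyDeploymentConfiguration()

    # Vespa rejects the removed deployment.xml unless a validation override is added.
    tomorrow = datetime.now() + timedelta(days=1)
    app_package.validations = [
        Validation(ValidationID("deployment-removal"), tomorrow.strftime("%Y-%m-%d"))
    ]

    app_package.to_files(application_root)
    vespa_cloud._start_prod_deployment(application_root)  # this deletes the deployment
```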
