Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add quoted search functionality on browse sources page #1737

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
87 changes: 72 additions & 15 deletions django/cantusdb_project/main_app/tests/test_views/test_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,105 +1000,162 @@ def test_filter_by_production_method(self) -> None:
self.assertNotIn(manuscript_source, sources)
self.assertIn(print_source, sources)

def test_search_by_title(self) -> None:
    """The "general search" field searches in `title`, `shelfmark`, `description`, and `summary`"""
    source = make_fake_source(
        shelfmark=faker.sentence(),
        published=True,
    )
    search_term = get_random_search_term(source.shelfmark)

    # Partial matching: a fragment of the shelfmark should be enough
    response = self.client.get(reverse("source-list"), {"general": search_term})
    self.assertIn(source, response.context["sources"])

    # Exact matching: the full value wrapped in double quotes
    response = self.client.get(
        reverse("source-list"), {"general": f'"{source.shelfmark}"'}
    )
    self.assertIn(source, response.context["sources"])

    # Test that postgres searches the unaccented version of the title:
    # store an accented value, then search for its *unaccented* form.
    # (Searching the accented string itself would match trivially and
    # would not exercise the unaccent lookup at all.)
    unaccented_title = source.shelfmark
    accented_title = add_accents_to_string(unaccented_title)
    source.title = accented_title
    source.save()
    response = self.client.get(
        reverse("source-list"), {"general": f'"{unaccented_title}"'}
    )
    self.assertIn(source, response.context["sources"])

def test_search_by_shelfmark(self) -> None:
    """The "general search" field matches a source's shelfmark, with and without accents."""
    hinst = make_fake_institution(name="Fake Institution", siglum="FA-Ke")
    source = make_fake_source(
        published=True, shelfmark="title", holding_institution=hinst
    )
    search_term = get_random_search_term(source.shelfmark)

    # Partial matching
    response = self.client.get(reverse("source-list"), {"general": search_term})
    self.assertIn(source, response.context["sources"])

    # Exact matching
    response = self.client.get(
        reverse("source-list"), {"general": f'"{source.shelfmark}"'}
    )
    self.assertIn(source, response.context["sources"])

    # Test that postgres searches the unaccented version of the shelfmark.
    # NOTE: mutate `shelfmark` itself — the previous version set
    # `source.siglum`, but the view searches the siglum on the holding
    # institution (`holding_institution__siglum`), so that assignment
    # exercised nothing. Store an accented shelfmark, then search for the
    # unaccented form: the unaccent lookup must still find the source.
    unaccented_shelfmark = source.shelfmark
    accented_shelfmark = add_accents_to_string(unaccented_shelfmark)
    source.shelfmark = accented_shelfmark
    source.save()
    response = self.client.get(
        reverse("source-list"), {"general": f'"{unaccented_shelfmark}"'}
    )
    self.assertIn(source, response.context["sources"])

def test_search_by_description(self) -> None:
    """The "general search" field matches a source's description, with and without accents."""
    source = make_fake_source(
        description=faker.sentence(),
        published=True,
        shelfmark="title",
    )
    search_term = get_random_search_term(source.description)

    # Partial matching
    response = self.client.get(reverse("source-list"), {"general": search_term})
    self.assertIn(source, response.context["sources"])

    # Exact matching
    response = self.client.get(
        reverse("source-list"), {"general": f'"{source.description}"'}
    )
    self.assertIn(source, response.context["sources"])

    # Test that postgres searches the unaccented version of the description.
    # Only `description` is mutated (the old version also clobbered
    # `source.title`, which is unrelated to this test). Store the accented
    # value, then search for the unaccented form so the unaccent lookup is
    # actually exercised.
    unaccented_description = source.description
    accented_description = add_accents_to_string(unaccented_description)
    source.description = accented_description
    source.save()
    response = self.client.get(
        reverse("source-list"), {"general": f'"{unaccented_description}"'}
    )
    self.assertIn(source, response.context["sources"])

def test_search_by_summary(self) -> None:
    """The "general search" field matches a source's summary, with and without accents."""
    source = make_fake_source(
        summary=faker.sentence(),
        published=True,
        shelfmark="title",
    )
    search_term = get_random_search_term(source.summary)

    # Partial matching
    response = self.client.get(reverse("source-list"), {"general": search_term})
    self.assertIn(source, response.context["sources"])

    # Exact matching
    response = self.client.get(
        reverse("source-list"), {"general": f'"{source.summary}"'}
    )
    self.assertIn(source, response.context["sources"])

    # Test that postgres searches the unaccented version of the summary.
    # Only `summary` is mutated (the old version also clobbered
    # `source.title`, which is unrelated here). Store the accented value,
    # then search for the unaccented form so the unaccent lookup is
    # actually exercised.
    unaccented_summary = source.summary
    accented_summary = add_accents_to_string(unaccented_summary)
    source.summary = accented_summary
    source.save()
    response = self.client.get(
        reverse("source-list"), {"general": f'"{unaccented_summary}"'}
    )
    self.assertIn(source, response.context["sources"])

def test_search_by_indexing_notes(self) -> None:
    """The "indexing notes" field searches in `indexing_notes` and indexer/editor related fields"""
    source = make_fake_source(
        indexing_notes=faker.sentence(),
        published=True,
        shelfmark="title",
    )
    search_term = get_random_search_term(source.indexing_notes)

    # Partial matching
    response = self.client.get(reverse("source-list"), {"indexing": search_term})
    self.assertIn(source, response.context["sources"])

    # Exact matching
    response = self.client.get(
        reverse("source-list"), {"indexing": f'"{source.indexing_notes}"'}
    )
    self.assertIn(source, response.context["sources"])

    # Test that postgres searches the unaccented version of indexing_notes.
    # Only `indexing_notes` is mutated (the old version also clobbered
    # `source.shelfmark`, which this test does not search). Store the
    # accented value, then search the unaccented form so the unaccent
    # lookup is actually exercised.
    unaccented_indexing_notes = source.indexing_notes
    accented_indexing_notes = add_accents_to_string(unaccented_indexing_notes)
    source.indexing_notes = accented_indexing_notes
    source.save()
    response = self.client.get(
        reverse("source-list"), {"indexing": f'"{unaccented_indexing_notes}"'}
    )
    self.assertIn(source, response.context["sources"])

def test_search_by_name(self) -> None:
    """The "general search" field matches a source's `name`."""
    fake_source = make_fake_source(
        name=faker.sentence(), published=True, shelfmark="title"
    )
    partial_term = get_random_search_term(fake_source.name)
    list_url = reverse("source-list")

    # A fragment of the name should be enough to find the source.
    partial_response = self.client.get(list_url, {"general": partial_term})
    self.assertIn(fake_source, partial_response.context["sources"])

    # The full name wrapped in double quotes (exact search) should also match.
    exact_response = self.client.get(
        list_url, {"general": f'"{fake_source.name}"'}
    )
    self.assertIn(fake_source, exact_response.context["sources"])

def test_ordering(self) -> None:
"""
Order is currently available by country, city + institution name (parameter:
Expand Down
54 changes: 37 additions & 17 deletions django/cantusdb_project/main_app/views/source.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from typing import Any, Optional

from django.contrib import messages
Expand Down Expand Up @@ -311,8 +312,17 @@ def get_queryset(self) -> QuerySet[Source]:
q_obj_filter &= Q(production_method=production_method)

if general_str := self.request.GET.get("general"):
# Strip spaces at the beginning and end. Then make list of terms split on spaces
general_search_terms = general_str.strip(" ").split(" ")
# Strip spaces at the beginning and end
general_str = general_str.strip()

# Use regex to extract quoted and unquoted terms
quoted_terms = re.findall(
r'"(.*?)"', general_str
) # Extract terms in quotes
unquoted_terms = re.findall(
r"\b[\w,-.]+\b", re.sub(r'"(.*?)"', "", general_str)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could also do this with one regex with look-aheads/look-behinds:

(?<!\")\b[\w,-.]+\b(?!\")

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably clearer what's happening here with the way you have it now...

)

# We need a Q Object for each field we're gonna look into
shelfmark_q = Q()
siglum_q = Q()
Expand All @@ -327,27 +337,35 @@ def get_queryset(self) -> QuerySet[Source]:
# provenance_q = Q()
summary_q = Q()

# For each term, add it to the Q object of each field with an OR operation.
# We split the terms so that the words can be separated in the actual
# field, allowing for a more flexible search, and a field needs
# to match only one of the terms
for term in general_search_terms:
holding_institution_q |= Q(holding_institution__name__icontains=term)
# Add unquoted terms to the Q object with partial matching (icontains)
for term in unquoted_terms:
holding_institution_q |= Q(
holding_institution__name__unaccent__icontains=term
)
holding_institution_city_q |= Q(
holding_institution__city__icontains=term
holding_institution__city__unaccent__icontains=term
)
shelfmark_q |= Q(shelfmark__unaccent__icontains=term)
siglum_q |= Q(holding_institution__siglum__unaccent__icontains=term)
description_q |= Q(description__unaccent__icontains=term)
summary_q |= Q(summary__unaccent__icontains=term)
name_q |= Q(name__icontains=term)
# provenance_q |= Q(provenance__name__icontains=term)
# All the Q objects are put together with OR.
# The end result is that at least one term has to match in at least one
# field
# general_search_q = (
# title_q | siglum_q | description_q | provenance_q
# )
name_q |= Q(name__unaccent__icontains=term)

# Add quoted terms to the Q object with exact matching (iexact)
for term in quoted_terms:
holding_institution_q |= Q(
holding_institution__name__unaccent__iexact=term
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think iexact is correct here...

If someone searches for "North Vancouver", don't I want any results where "North Vancouver" shows up somewhere in one of these fields, not just results where a field is exactly "North Vancouver"?

I think contains is probably still the method, it's just we're not searching single words, but sometimes groups of words depending on the quotes.

)
holding_institution_city_q |= Q(
holding_institution__city__unaccent__iexact=term
)
shelfmark_q |= Q(shelfmark__unaccent__iexact=term)
siglum_q |= Q(holding_institution__siglum__unaccent__iexact=term)
description_q |= Q(description__unaccent__iexact=term)
summary_q |= Q(summary__unaccent__iexact=term)
name_q |= Q(name__unaccent__iexact=term)

# Combine all Q objects with OR
general_search_q = (
shelfmark_q
| siglum_q
Expand All @@ -357,6 +375,8 @@ def get_queryset(self) -> QuerySet[Source]:
| holding_institution_city_q
| name_q
)

# Apply the general search Q object to the filter
q_obj_filter &= general_search_q

# For the indexing notes search we follow the same procedure as above but with
Expand Down
Loading