Skip to content

Commit

Permalink
Merge pull request #27 from NikitaKozlovtcev/feature/NV-8056-urls-ignor
Browse files Browse the repository at this point in the history
[NV-8056] added ignoring urls to SpellCheckService.prepare
  • Loading branch information
xfenix authored Jul 3, 2024
2 parents 42c5dbf + 4541a69 commit 3367835
Show file tree
Hide file tree
Showing 10 changed files with 287 additions and 181 deletions.
405 changes: 227 additions & 178 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ sentry-sdk = "*"
pydantic-settings = "*"
fastapi = "*"
structlog = "*"
urlextract = "*"

[tool.poetry.group.dev.dependencies]
httpx = "*"
Expand Down
5 changes: 3 additions & 2 deletions scripts/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python3
"""Simple dockerhub readme generator."""

import argparse
import pathlib
import re
Expand All @@ -19,7 +20,7 @@ def _update_dockerhub_readme() -> None:
r"\#\# Development.*",
r"",
README_PATH.read_text(),
flags=re.I | re.S,
flags=re.IGNORECASE | re.DOTALL,
).strip()
new_content = replace_tag_in_readme(new_content, parse_last_git_tag())
README_PATH.write_text(new_content + "\n")
Expand Down Expand Up @@ -58,7 +59,7 @@ def _update_readme() -> None:
r"(.*Here is a list of them\:).*?(\#\#\#\s.*)",
r"\1\n" + automatic_config_readme + r"\n\n\2",
new_content,
flags=re.I | re.M | re.S,
flags=re.IGNORECASE | re.MULTILINE | re.DOTALL,
)
new_content = replace_tag_in_readme(new_content, parse_last_git_tag())
README_PATH.write_text(new_content)
Expand Down
2 changes: 1 addition & 1 deletion scripts/_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,5 @@ def replace_tag_in_readme(readme_text: str, new_tag: str) -> str:
r"(xfenix/spellcheck-microservice\:)(\d{1,}\.\d{1,}\.\d{1,})",
r"\g<1>" + new_tag,
readme_text,
flags=re.I | re.S,
flags=re.IGNORECASE | re.DOTALL,
)
7 changes: 7 additions & 0 deletions tests/_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
We do not want to parse files for tests — it's just a waste of time.
"""

import typing


Expand All @@ -11,3 +12,9 @@
""",
"""Апичатки — настаящая граза фсякага блохера. Это палнаценный ужос в текздах. Так жидь нельзйа""",
)

# Russian-language message template; the single ``{}`` placeholder is meant to
# be filled through ``str.format`` by the test that uses it.
COMMON_TEXT_MESSAGE: typing.Final[str] = "\n".join(
    (
        "Коллеги из поддержки юридических лиц работают в чате по будням с 6:00 до 22:00 по Москве.",  # noqa: RUF001
        "Напишите в рабочее время или позвоните 8(800)700-46-46 по будням с 6:00 до 22:00 суббота с 9:00 по 18:00.",  # noqa: RUF001
        # The trailing space after the placeholder is part of the template.
        "{} ",
        "Если хотите, я передам вопрос, и вам напишут в рабочее время.",  # noqa: RUF001
    ),
)
32 changes: 32 additions & 0 deletions tests/test_spell.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest

from tests._fixtures import COMMON_TEXT_MESSAGE
from whole_app import models
from whole_app.spell import SpellCheckService

Expand All @@ -12,3 +15,32 @@ def test_correct_spell() -> None:
# here we should check that first_position and last_position are correct and that word matches the word from text,
# and that corrections contains the right variants (this can be skipped in the randomized case)
# important: first_position, last_position and the correct words must be picked MANUALLY and entered here by hand


# URL shapes substituted into COMMON_TEXT_MESSAGE: bare hosts, scheme-prefixed
# hosts, paths, query strings and fragments.
_URL_SAMPLES = [
    "www.rzb.ru",
    "https://rzb.ru",
    "https://www.rzb.ru",
    "rzb.ru/taCWpO",
    "www.rzb.ru/taCWpO",
    "https://rzb.ru/taCWpO",
    "https://www.rzb.ru/taCWpO",
    "https://www.asd.google.com/search?q=some+text&param=3#dfsdf",
    "https://www.google.com",
    "http://google.com/?q=some+text&param=3#dfsdf",
    "https://www.google.com/api/?",
    "https://www.google.com/api/login.php",
    "https://r-chat.raiffeisen.ru/admin/operator/",
    "https://r-chat.raiffeisen.ru/admin/operator/taCWpO",
]


@pytest.mark.parametrize("url", _URL_SAMPLES)
def test_urls_ignored(url: str) -> None:
    """Check that a message containing ``url`` yields no corrections when URLs are excluded."""
    payload = models.SpellCheckRequest(
        text=COMMON_TEXT_MESSAGE.format(url),
        language="ru_RU",
        exclude_urls=True,
    )
    # ``prepare`` returns the service itself, so the calls can be split up.
    service: SpellCheckService = SpellCheckService().prepare(payload)
    found = service.run_check()
    assert not found
1 change: 1 addition & 0 deletions whole_app/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
This file is meant only for basic worker wrappers and FastAPI exposure.
For endpoints, look in views.py
"""

import typing

import fastapi
Expand Down
2 changes: 2 additions & 0 deletions whole_app/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# pylint: disable=no-member
"""Models for input/output."""

import typing

import pydantic
Expand All @@ -24,6 +25,7 @@ class SpellCheckRequest(pydantic.BaseModel):
min_length=SETTINGS.username_min_length,
max_length=SETTINGS.username_max_length,
)
exclude_urls: bool = True


class SpellCheckResponse(pydantic.BaseModel):
Expand Down
12 changes: 12 additions & 0 deletions whole_app/spell.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import re
import typing

import pylru
import urlextract
from enchant.checker import SpellChecker

from . import models
Expand All @@ -11,12 +13,15 @@
pylru.lrucache(SETTINGS.cache_size) if SETTINGS.cache_size > 0 else {}
)

# Delimiters used to break a URL into word-like pieces. "//" is listed before
# the single-character class so a scheme separator is consumed as one delimiter
# rather than as two consecutive slashes.
SEPARATORS_TO_SPLIT_URL_BY_WORDS: typing.Final[re.Pattern[str]] = re.compile(r"//|[.:/?&=+#-]")


class SpellCheckService:
__slots__ = ("_input_text", "_spellcheck_engine", "_exclusion_words")
_input_text: str
_spellcheck_engine: SpellChecker
_exclusion_words: list[str]
_url_extractor: urlextract.URLExtract = urlextract.URLExtract()

def prepare(
self: "SpellCheckService",
Expand All @@ -26,6 +31,13 @@ def prepare(
"""Initialize machinery."""
self._input_text = request_payload.text
self._exclusion_words = exclusion_words if exclusion_words else []

if request_payload.exclude_urls:
for one_url in self._url_extractor.find_urls(self._input_text):
self._exclusion_words.extend(
[word.lower() for word in re.split(SEPARATORS_TO_SPLIT_URL_BY_WORDS, one_url)]
)

self._spellcheck_engine = SpellChecker(request_payload.language)
return self

Expand Down
1 change: 1 addition & 0 deletions whole_app/views.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""All project end-points lie here."""

import typing

import fastapi
Expand Down

0 comments on commit 3367835

Please sign in to comment.