Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BeautifulSoup logic in separate file #56

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
45 changes: 45 additions & 0 deletions smarsy/bs_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from bs4 import BeautifulSoup


class BSHelper(object):
"""
The help class for BeautifulSoup library
"""
def __init__(self, html):
    """Keep the raw markup; it is parsed on access to ``bs_object``."""
    self.html = html

@property
def bs_object(self):
    """
    Parse ``self.html`` with BeautifulSoup using the 'html.parser' backend.

    The markup is parsed again on every access. Returns the resulting
    BeautifulSoup object, or False when BeautifulSoup raises TypeError.
    """
    try:
        return BeautifulSoup(self.html, 'html.parser')
    except TypeError:
        return False

def bs_safe_select(self, html, *args):
"""
Utility function used to get a content string from a
HTML and tuple of selectors. Returns False
if no object is found for the given selector
"""
for arg in args:
selectedElems = html.select_one(arg)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't it always keep only the last result of select_one? Nothing is accumulated: every iteration re-assigns selectedElems with the new value. Or is that the expected behaviour?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is what is expected. Each new iteration overwrites the variable selectedElems.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then I don't get it. Is it supposed to return:

  • all objects for all found selectors?
  • the last found object?
  • the first found object?
  • any object?
  • is it expected to be some kind of chaining action, when the result of the previous iteration is used in the next one?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then I don't get it, does it supposed to return:

We take the object and apply select_one with selector 1 to it, then apply select_one with selector 2 to the object we got back, and so on up to selector X; finally we return the resulting object, or False.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please provide a real example from the smarsy website: the expected call together with the expected result?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From issue#51
<TD valign=top align="left" width="120"><img src="https://smarsy.ua/images/mypage/parent_1.png"></TD>
We must find the td with valign=top and then, inside the returned object, find img[src].

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and what would be the function call for that html?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

html.select_one('[valign=top]').select_one('img[src]')

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will it work? Additionally, here you are passing a single-value parameter, but your function expects an array. Please provide an example with an array.

if selectedElems is not None:
return selectedElems
return False

def bs_safe_get(self, html, attribute):
    """
    Look up ``attribute`` on ``html`` through its ``get`` method.

    Returns whatever ``html.get(attribute)`` hands back, or False
    when that value is None.
    """
    value = html.get(attribute)
    return False if value is None else value
92 changes: 92 additions & 0 deletions tests/test_bs_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import unittest
import sys
import os

from unittest.mock import patch, PropertyMock

# Put the parent of this file's directory, and its ``smarsy``
# sub-directory, on sys.path so that ``smarsy.bs_helper`` can be
# imported below.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             '..')))
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             '..', 'smarsy')))
# excluding following line for linter as it complains that
# from import is supposed to be at the top of the file

from smarsy.bs_helper import BSHelper # noqa


class TestBSHelperInstance(unittest.TestCase):
    """Construction of a BSHelper instance."""

    def test_bshelper_instance_created(self):
        """The markup given to the constructor is kept on ``html``."""
        markup = 'some html'
        helper = BSHelper(markup)
        self.assertEqual(helper.html, markup)


class Test_bs_object(unittest.TestCase):
    """Tests for the ``BSHelper.bs_object`` property."""

    @patch('smarsy.bs_helper.BeautifulSoup', new_callable=PropertyMock)
    def test_bs_object_called_with_expected_html(self, mocked_soup):
        """The stored html is passed to BeautifulSoup with 'html.parser'."""
        html = '<tr></tr>'
        parsed = BSHelper(html).bs_object
        mocked_soup.assert_called_with(html, 'html.parser')
        # The property must hand back exactly what BeautifulSoup produced.
        self.assertIs(parsed, mocked_soup.return_value)

    @patch('smarsy.bs_helper.BeautifulSoup', side_effect=TypeError)
    def test_bs_object_return_false_with_unexpected_html(
            self, mocked_soup):
        """A TypeError raised by BeautifulSoup is turned into False."""
        source_page = BSHelper(12345)
        self.assertFalse(source_page.bs_object)


class Test_bs_safe_select(unittest.TestCase):
@patch('smarsy.bs_helper.BeautifulSoup')
def setUp(self, mocked_soup):
    """Prepare a helper, a mocked soup and the selector fixtures."""
    # Plain fixture values first.
    self.expected = 'some text'
    self.selector = 'some_tag'
    self.selectors = ('some_tag1', 'some_tag2', 'some_tag3')
    self.select_one_values = 'some text1', 'some text2', 'some text3'
    # The mock stands in for a parsed soup; select_one is stubbed on it.
    mocked_soup.select_one.return_value = 'some text'
    self.mocked_soup = mocked_soup
    self.source_page = BSHelper('some html')

def test_bs_safe_select_return_expected_text_with_single_selector(self):
    """One selector: the stubbed select_one value is returned."""
    helper = self.source_page
    result = helper.bs_safe_select(self.mocked_soup, self.selector)
    self.assertEqual(result, self.expected)

def test_bs_safe_select_return_expected_text_with_many_selectors(self):
select_one = None
for select_one_value in self.select_one_values:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test reproduces the same logic as the function under test, which is not good. This test is bad, but before discussing what exactly is bad you need to answer the question posted under this function's source code.

select_one = select_one_value
self.mocked_soup.select_one.return_value = select_one
actual = self.source_page.bs_safe_select(self.mocked_soup,
self.selectors)
self.assertEqual(actual, select_one)
dkultasev marked this conversation as resolved.
Show resolved Hide resolved

def test_bs_safe_select_return_false_when_no_object_is_found(
        self):
    """Return False when the selector matches nothing."""
    # select_one yields None when nothing matches; an empty string is
    # not None, so it would be returned as-is and the False branch
    # would never be exercised.
    self.mocked_soup.select_one.return_value = None
    result = self.source_page.bs_safe_select(self.mocked_soup,
                                             self.selector)
    self.assertIs(result, False)


class Test_bs_safe_get(unittest.TestCase):
    """Tests for ``BSHelper.bs_safe_get``."""

    @patch('smarsy.bs_helper.BeautifulSoup')
    def setUp(self, mocked_soup):
        """Prepare a helper and a mocked element whose ``get`` is stubbed."""
        self.source_page = BSHelper('some html')
        self.mocked_soup = mocked_soup
        self.expected_text = 'some text'
        self.expected_attribute = 'some attribute'
        self.mocked_soup.get.return_value = self.expected_text

    def test_bs_get_called_with_expected_html_and_attribute(self):
        """``get`` is called on the element with the requested attribute."""
        self.source_page.bs_safe_get(self.mocked_soup, self.expected_attribute)
        self.mocked_soup.get.assert_called_with(self.expected_attribute)

    def test_bs_safe_get_return_false_when_element_is_empty(
            self):
        """An empty attribute value yields a falsy result."""
        self.mocked_soup.get.return_value = ''
        self.assertFalse(self.source_page.bs_safe_get(self.mocked_soup,
                                                      self.expected_attribute))

    def test_bs_safe_get_return_false_when_attribute_is_missing(
            self):
        """A missing attribute (``get`` gives None) yields exactly False."""
        # The empty-string case above is returned unchanged; only None
        # reaches the explicit ``return False`` branch.
        self.mocked_soup.get.return_value = None
        actual = self.source_page.bs_safe_get(self.mocked_soup,
                                              self.expected_attribute)
        self.assertIs(actual, False)

    def test_bs_safe_get_return_expected_text(self):
        """The value returned by ``get`` is passed through unchanged."""
        actual = self.source_page.bs_safe_get(self.mocked_soup,
                                              self.expected_attribute)
        self.assertEqual(actual, self.expected_text)