Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scrape Salaries of a Team's Players for a Season #185

Open
wants to merge 7 commits into
base: v4
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@ from basketball_reference_web_scraper.data import Team

## API

This client has seven methods
This client has eight methods
* Getting player box scores by a date (`client.player_box_scores`)
* Getting team box scores by a date (`client.team_box_scores`)
* Getting the schedule for a season (`client.season_schedule`)
* Getting players totals for a season (`client.players_season_totals`)
* Getting players advanced season statistics for a season (`client.players_advanced_season_totals`)
* Getting regular season box scores for a given player and season (`client.regular_season_player_box_scores`)
* Getting the salaries of players of a team for a season (`client.team_salaries`)
* Searching (`client.search`)

You can see all methods used in [this `repl`](https://repl.it/@jaebradley/v300api-examples).
Expand Down Expand Up @@ -179,6 +180,21 @@ The `player_identifier` is Basketball Reference's unique identifier for each pla
his `player_identifier` is `westbru01` (you can see this from his player page URL:
`https://www.basketball-reference.com/players/w/westbru01/gamelog/2020`)

### Get salary data for a team in a particular season

```python
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import Team

# Get salaries of all the players on the 1997-1998 Bulls team
client.team_salaries(
    team=Team.CHICAGO_BULLS,
    season_end_year=1998
)

# The team_salaries method supports all output behavior previously described
```

### Search

```python
Expand Down
24 changes: 23 additions & 1 deletion basketball_reference_web_scraper/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from basketball_reference_web_scraper.writers import CSVWriter, RowFormatter, \
BOX_SCORE_COLUMN_NAMES, SCHEDULE_COLUMN_NAMES, PLAYER_SEASON_TOTALS_COLUMN_NAMES, \
PLAYER_ADVANCED_SEASON_TOTALS_COLUMN_NAMES, TEAM_BOX_SCORES_COLUMN_NAMES, PLAY_BY_PLAY_COLUMN_NAMES, \
PLAYER_SEASON_BOX_SCORE_COLUMN_NAMES, SearchResultsCSVWriter, SEARCH_RESULTS_COLUMN_NAMES
PLAYER_SEASON_BOX_SCORE_COLUMN_NAMES, SearchResultsCSVWriter, SEARCH_RESULTS_COLUMN_NAMES, \
SALARY_COLUMN_NAMES


def player_box_scores(day, month, year, output_type=None, output_file_path=None, output_write_option=None,
Expand Down Expand Up @@ -133,6 +134,27 @@ def players_advanced_season_totals(season_end_year, include_combined_values=Fals
json_options=json_options,
)

def team_salaries(team, season_end_year, output_type=None, output_file_path=None, output_write_option=None,
                  json_options=None):
    """Return the salaries of every player on a team's roster for a season.

    team -- a Team enum member identifying the franchise
    season_end_year -- calendar year the season ended in (e.g. 1998 for 1997-98)
    The remaining parameters control output behavior, mirroring the other
    client methods.

    Raises InvalidSeason when Basketball Reference returns a 404 for the
    team/season page; any other HTTP error propagates unchanged.
    NOTE(review): a 404 can also mean an unknown team abbreviation, so the
    exception name may be slightly misleading — confirm desired behavior.
    """
    try:
        values = HTTPService(parser=ParserService()).team_salaries(team, season_end_year)
    except requests.exceptions.HTTPError as error:
        # Only a missing page maps to InvalidSeason; other failures re-raise.
        if error.response.status_code != requests.codes.not_found:
            raise error
        raise InvalidSeason(season_end_year=season_end_year)
    salary_writer = CSVWriter(
        column_names=SALARY_COLUMN_NAMES,
        row_formatter=RowFormatter(data_field_names=SALARY_COLUMN_NAMES),
    )
    return output(
        values=values,
        output_type=output_type,
        output_file_path=output_file_path,
        output_write_option=output_write_option,
        csv_writer=salary_writer,
        json_options=json_options,
    )

def team_box_scores(day, month, year, output_type=None, output_file_path=None, output_write_option=None,
json_options=None):
Expand Down
37 changes: 37 additions & 0 deletions basketball_reference_web_scraper/html.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re

from basketball_reference_web_scraper.utilities import extract_html_obj_in_comment


class BasicBoxScoreRow:
def __init__(self, html):
Expand Down Expand Up @@ -1090,3 +1092,38 @@ def totals_table(self):
return PlayerPageTotalsTable(html=totals_tables[0])

return None

class PlayerSalaryRow:
    """A single player's row in a team's salaries table.

    NOTE(review): the xpaths below are absolute ('//td…'), so lxml searches
    the entire parsed fragment rather than just this row element; the stored
    row index then selects the matching cell. This only works because
    TeamSalaryTable.rows removes the header row first and enumerates data
    rows in document order — confirm before reusing this class elsewhere.
    """

    def __init__(self, html, row_index):
        self.html = html
        self.index = row_index

    @property
    def name(self):
        # Player display name taken from the row's "player" cell.
        player_cells = self.html.xpath('//td[@data-stat="player"]')
        return player_cells[self.index].text_content()

    @property
    def salary(self):
        # The "csk" attribute carries the raw numeric salary; the cell's
        # visible text is the formatted (e.g. "$...") representation.
        salary_cells = self.html.xpath('//td[@data-stat="salary"]')
        return salary_cells[self.index].get('csk')

class TeamSalaryTable:
    """Wraps the salaries table embedded in a team's season page."""

    def __init__(self, html):
        self.html = html

    @property
    def rows(self):
        """Return a PlayerSalaryRow for each data row of the salaries table."""
        # basketball-reference does this weird thing where it puts table data in
        # comments on the HTML doc which then gets added to the DOM (I'm guessing)
        # after a certain amount of time. I assume it is an attempt to make scraping
        # more difficult. This is evidenced by the fact that if you attempt to load
        # a page on a team with Javascript disabled you will not be able to see all
        # the tables. To get around this we just read from the comments.
        salary_table = extract_html_obj_in_comment(self.html, '//table[@id="salaries2"]')
        # NOTE(review): extract_html_obj_in_comment returns None when no comment
        # matches — that would raise AttributeError below; confirm whether a
        # clearer error is wanted for pages without a salaries table.
        # Drop the header row so data-row indexes line up with the cell lists
        # that PlayerSalaryRow queries with absolute xpaths.
        header = salary_table.xpath('//tr')[0]
        header.getparent().remove(header)
        return [
            PlayerSalaryRow(html=row_html, row_index=i)
            for i, row_html in enumerate(salary_table.xpath('//tr'))
        ]
18 changes: 17 additions & 1 deletion basketball_reference_web_scraper/http_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from basketball_reference_web_scraper.errors import InvalidDate, InvalidPlayerAndSeason
from basketball_reference_web_scraper.html import DailyLeadersPage, PlayerSeasonBoxScoresPage, PlayerSeasonTotalTable, \
PlayerAdvancedSeasonTotalsTable, PlayByPlayPage, SchedulePage, BoxScoresPage, DailyBoxScoresPage, SearchPage, \
PlayerPage
PlayerPage, TeamSalaryTable



class HTTPService:
Expand Down Expand Up @@ -98,6 +99,21 @@ def players_season_totals(self, season_end_year):
table = PlayerSeasonTotalTable(html=html.fromstring(response.content))
return self.parser.parse_player_season_totals(totals=table.rows)

def team_salaries(self, team, season_end_year):
    """Fetch a team's season page and parse its salaries table.

    Raises requests.exceptions.HTTPError (via raise_for_status) when the
    page cannot be retrieved, e.g. for an unknown team/season combination.
    """
    page_url = '{BASE_URL}/teams/{team_abbr}/{end_year}.html'.format(
        BASE_URL=HTTPService.BASE_URL,
        team_abbr=TEAM_TO_TEAM_ABBREVIATION[team],
        end_year=season_end_year,
    )
    response = requests.get(url=page_url)
    response.raise_for_status()
    salary_table = TeamSalaryTable(html=html.fromstring(response.content))
    return self.parser.parse_team_salary(player_salaries=salary_table.rows)

def schedule_for_month(self, url):
response = requests.get(url=url)

Expand Down
8 changes: 6 additions & 2 deletions basketball_reference_web_scraper/parser_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
SecondsPlayedParser, PlayerBoxScoresParser, PlayerAdvancedSeasonTotalsParser, PeriodDetailsParser, \
PeriodTimestampParser, ScoresParser, PlayByPlaysParser, TeamNameParser, ScheduledStartTimeParser, \
ScheduledGamesParser, PlayerBoxScoreOutcomeParser, PlayerSeasonBoxScoresParser, SearchResultNameParser, \
ResourceLocationParser, SearchResultsParser, LeagueAbbreviationParser, PlayerDataParser
ResourceLocationParser, SearchResultsParser, LeagueAbbreviationParser, PlayerDataParser, TeamSalaryParser


class ParserService:
Expand Down Expand Up @@ -76,6 +76,7 @@ def __init__(self):
league_abbreviation_parser=self.league_abbreviation_parser,
)
self.team_totals_parser = TeamTotalsParser(team_abbreviation_parser=self.team_abbreviation_parser)
self.team_salary_parser = TeamSalaryParser()

def parse_play_by_plays(self, play_by_plays, away_team_name, home_team_name):
return self.play_by_plays_parser.parse(
Expand Down Expand Up @@ -106,4 +107,7 @@ def parse_player_search_results(self, nba_aba_baa_players):
return self.search_results_parser.parse(nba_aba_baa_players=nba_aba_baa_players)

def parse_player_data(self, player):
return self.player_data_parser.parse(player=player)
return self.player_data_parser.parse(player=player)

def parse_team_salary(self, player_salaries):
return self.team_salary_parser.parse(player_salaries=player_salaries)
4 changes: 4 additions & 0 deletions basketball_reference_web_scraper/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,3 +552,7 @@ def parse(self, player):
)
)
}

class TeamSalaryParser:
    """Converts scraped salary table rows into plain dictionaries."""

    def parse(self, player_salaries):
        """Return one {'name', 'salary'} dict per row, salary coerced to int."""
        return [
            {'name': entry.name, 'salary': int(entry.salary)}
            for entry in player_salaries
        ]
9 changes: 9 additions & 0 deletions basketball_reference_web_scraper/utilities.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from lxml import etree, html

def str_to_int(value, default=int(0)):
stripped_value = value.strip()
try:
Expand All @@ -18,3 +20,10 @@ def merge_two_dicts(first, second):
combined = first.copy()
combined.update(second)
return combined

def extract_html_obj_in_comment(html_tree, xpath):
    """Search HTML comments for markup matching ``xpath``.

    Basketball Reference hides some tables inside HTML comments; this walks
    every comment node, parses its text as HTML, and returns the first parsed
    fragment containing a match for ``xpath``. Returns None when no comment
    matches.
    """
    for node in html_tree.iter(etree.Comment):
        comment = node.text
        # Empty/whitespace-only comments (and unparseable ones, e.g. IE
        # conditional comments) would make html.fromstring raise — skip them.
        if not comment or not comment.strip():
            continue
        try:
            extracted_html = html.fromstring(comment)
        except etree.ParserError:
            continue
        if extracted_html.xpath(xpath):
            return extracted_html
    return None
8 changes: 6 additions & 2 deletions basketball_reference_web_scraper/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
# without doing it this way

SHARED_COLUMN_NAMES = [
"team",
"location",
"team", "location",
"opponent",
"outcome",
"seconds_played",
Expand Down Expand Up @@ -130,6 +129,11 @@
"leagues",
]

# Column headers for CSV output of client.team_salaries.
SALARY_COLUMN_NAMES = ["name", "salary"]


class WriteOptions:
def __init__(self, file_path=None, mode=None, custom_options=None):
Expand Down