The data science job market is rapidly evolving, and understanding past trends is crucial for predicting future demand and preparing for upcoming opportunities. Although the data is from 2021, it provides a historical benchmark that can help identify comparable trends, shifts in the job market, and emerging skills today.
The dataset used for this project can be retrieved from here.
To run this project, you need the following installed:
- Python (>= 3.9)
- Jupyter Notebook
- NumPy (>= 1.19.5)
- Pandas (>= 2.2.2)
- Matplotlib (>= 3.8.2)
- Seaborn (>= 0.13.2)
- Plotly (>= 5.22.0)

You can install the required Python packages with:

pip install numpy pandas matplotlib seaborn plotly jupyter
The project is organized into three stages:
- Data Cleaning (a minimal loading sketch follows this list)
- Exploratory Data Analysis
- Data Visualization
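As a minimal sketch of the Data Cleaning stage, assuming the dataset is a CSV with one binary column per skill; the file name data_scientist_jobs.csv and the job_title column are hypothetical placeholders, not confirmed details of the original notebook:

import pandas as pd

# Load the raw dataset (file name is an assumption; adjust to your download)
df = pd.read_csv('data_scientist_jobs.csv')

# Normalize column names to snake_case so columns like 'Big Data' become 'big_data'
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Drop exact duplicate postings
df = df.drop_duplicates()

# Drop rows missing a job title, if that column exists
if 'job_title' in df.columns:
    df = df.dropna(subset=['job_title'])

print(df.shape)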
The visualizations below provide insight into the data science job market, highlighting key skills and their prevalence.
The interactive Plotly graphs do not render on GitHub, so screenshots of the visualizations are provided instead.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots with 2 rows and 1 column
fig = make_subplots(rows=2, cols=1)

# Select the binary skill columns needed for the melt
data = df[['python', 'excel', 'hadoop', 'spark',
           'tableau', 'aws', 'big_data', 'machine_learning']]

# melt unpivots the DataFrame from wide format to long format,
# producing two columns: Skill and Presence (0 or 1)
data_melted = data.melt(var_name='Skill', value_name='Presence')

# Count how often each skill is present (1) or absent (0)
prob_data = data_melted.groupby(['Skill', 'Presence']).size().reset_index(name='Count')

# Divide each count by the number of job postings to get a probability
prob_data['Probability'] = prob_data['Count'] / len(data)

conditions = [
    {'presence': 1, 'prob_color': 'purple', 'count_color': 'red',
     'prob_name': 'Probability - yes (1)', 'count_name': 'Count - yes (1)'},
    {'presence': 0, 'prob_color': 'green', 'count_color': 'blue',
     'prob_name': 'Probability - no (0)', 'count_name': 'Count - no (0)'}
]

for cond in conditions:
    subset = prob_data.query(f'Presence == {cond["presence"]}')
    # Subplot 1: probability of each skill being present/absent
    fig.add_trace(
        go.Bar(x=subset['Skill'],
               y=subset['Probability'],
               name=cond['prob_name'],
               marker_color=cond['prob_color']),
        row=1, col=1
    )
    # Subplot 2: raw counts for each skill
    fig.add_trace(
        go.Bar(x=subset['Skill'],
               y=subset['Count'],
               name=cond['count_name'],
               marker_color=cond['count_color']),
        row=2, col=1
    )

fig.update_layout(height=500, width=1000)
fig.update_yaxes(title_text='Probability', row=1, col=1)
fig.update_yaxes(title_text='Count', row=2, col=1)
fig.show()
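To make the melt step concrete, here is a tiny standalone example of the wide-to-long unpivot; the miniature two-skill frame is illustrative only, not taken from the dataset:

import pandas as pd

# A miniature wide-format frame: one binary column per skill
mini = pd.DataFrame({'python': [1, 0, 1], 'excel': [0, 0, 1]})

# melt unpivots to long format: one row per (skill, value) observation
mini_melted = mini.melt(var_name='Skill', value_name='Presence')
print(mini_melted)
#     Skill  Presence
# 0  python         1
# 1  python         0
# 2  python         1
# 3   excel         0
# 4   excel         0
# 5   excel         1

# Grouping then counts how often each skill is present or absent,
# exactly as prob_data is built above
print(mini_melted.groupby(['Skill', 'Presence']).size().reset_index(name='Count'))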
import plotly.graph_objects as go

# Compute correlation coefficients between each pair of skills
correl = df[['python', 'excel', 'hadoop', 'spark',
             'tableau', 'aws', 'big_data', 'machine_learning']].corr()

# Generate (row, column) index pairs for every cell of the correlation matrix
pairs = [(i, j) for i in range(len(correl)) for j in range(len(correl))]

# Create a text annotation for each cell showing the rounded coefficient
annotations = [
    go.layout.Annotation(
        x=correl.columns[j],
        y=correl.columns[i],
        text=str(round(correl.iloc[i, j], 2)),
        showarrow=False,
        font=dict(size=12, color="white")
    )
    for i, j in pairs
]

# Create the heatmap from the correlation values
fig = go.Figure(data=go.Heatmap(
    z=correl.values,
    x=correl.columns,
    y=correl.columns
))

# Size the figure and overlay the annotations
fig.update_layout(height=500,
                  width=1000,
                  annotations=annotations)
fig.show()
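The heatmap makes co-occurring skills easy to spot visually. As a possible follow-up (not part of the original notebook), the same correl matrix can be ranked programmatically to surface the strongest skill pairs; the names upper and pairs_ranked are my own:

import numpy as np

# Keep only the upper triangle (k=1 excludes the diagonal of 1.0s)
# so each skill pair appears exactly once
upper = correl.where(np.triu(np.ones(correl.shape, dtype=bool), k=1))

# Stack into a Series indexed by (skill_a, skill_b) and sort by strength
pairs_ranked = upper.stack().sort_values(ascending=False)
print(pairs_ranked.head(5))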