-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnapoli.Rmd
654 lines (555 loc) · 31.9 KB
/
napoli.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
---
title: "Napoli Campioni in Italia"
author: "Emanuele Iaccarino"
date: "`r Sys.Date()`"
output: html_document
---
#Web scraping using rvest
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(rvest)
library(ggplot2)
library(dplyr)
library(cluster) # For clustering algorithms
library(FactoMineR) # For PCA analysis
library(factoextra) # For visualization
url <- "https://fbref.com/it/squadre/d48ad4ff/Statistiche-Napoli#site_menu_link"
page <- read_html(url)
content <- html_text(page)
print(content)
```
```{r}
table <- html_table(page, fill = TRUE)
for (i in 1:length(table)) {
if (!(i %in% c(2, 13, 14))) {
colnames(table[[i]]) <- table[[i]][1, ]
table[[i]] <- table[[i]][-1, ]
}
}
```
```{r}
data <- apply(table[[2]], 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
data =as.data.frame(data [-25,])
data $Differenza= data $Rf-data $xG
media <- mean(data $Differenza)
mediana <- median(data $Differenza)
deviazione_standard <- sd(data $Differenza)
cat("Media:", media, "\n")
cat("Mediana:", mediana, "\n")
cat("Deviazione Standard:", deviazione_standard, "\n")
```
expected goal never lie!!
the mean is postive so we confirm the overall overperformance, the standard deviation is not really high so we can say that Naples had almost the trend in all the season, in the study below we will focuse on each period to have a better understanding of the season
#Analysis of the performance of the team
```{r}
#remove italy cup data since it has no data
table[[2]] =as.data.frame(table[[2]] [-25,])
table[[2]]$Rf <- as.numeric(table[[2]]$Rf)
table[[2]]$Data <- as.Date(table[[2]]$Data, format = "%d-%m-%Y")
#graphic between Goal made and expected goal
ggplot(data = table[[2]], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red") #adding regression line
table[[2]]$Rs=as.numeric(table[[2]]$Rs)
#graphic between Goal conceded and expected goal against
ggplot(data = table[[2]], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
```
we can see that Naples has an overall good performance about scoring, it has a mean more of less of 0.27 so at the end of the season the team scored more than it has created. It started really well, in fact the intercept is at 0,6 and slope goes down because they won the championship in advance so they became more relaxed in game. Let's analyze for each time of the season how the time performed
###First half of the season and the second half of the season, let's see the difference
```{r}
ggplot(data = table[[2]][1:26,], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita girone andata") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
ggplot(data = table[[2]][27:nrow(table[[2]]),], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita girone ritorno") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
ggplot(data = table[[2]][1:26,], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita girone andata") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
ggplot(data = table[[2]][27:nrow(table[[2]]),], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita girone ritorno") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
```
we can see that the first half of the season went pretty well, in fact the Naples lost only two game(Internazionale and Liverpool) and draw two game as well(Fiorentina and Lecce at the start of the season). In the second half of the season the intercept became lower because Napoli lost the unpredictability, teams became more aware of the scheme and key players, the slope goes down because as I said before when they won the Championship they became more relaxed.
About the defense, we can see that at the start of the season the team managed to defend very well, in fact the difference between goal and XAG is negative, while in the second half the defensive performance became worse and the really negative match against AC MILAN influence a lot
###World Cup Analysis, before and after the
```{r}
ggplot(data = table[[2]][1:21,], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita pre mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
ggplot(data = table[[2]][22:nrow(table[[2]]),], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita post mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
ggplot(data = table[[2]][1:21,], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita pre mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
ggplot(data = table[[2]][22:nrow(table[[2]]),], aes(x = Data, y = Rs - xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita post mondiale") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
```
While the Serie A was in stand by because the World Cup was playing, people were saying that Naples, after a really good start will be going down. The start was not the best, losing with Inter in a game when the attackers had really big problem to score but they managed to come back in the following games and to make a bigger lead. The match against Juventus was the best game of the season but in the next game Naples was able to win difficult matches with a good overperformance thanks to Victor Osimhen
###AC MILAN 0-4 , before and after
is worth to analyze this time line because in my opion the Championship match with Ac MILAN was the reason for the downfall of the good performance of Naples
```{r}
# Grafico delle differenze tra gol fatti e xG con retta di regressione
ggplot(data = table[[2]][1:35,], aes(x = Data, y = Rf - xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita pre Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
# Grafico delle differenze tra gol fatti e xG con retta di regressione
ggplot(data = table[[2]][36:nrow(table[[2]]),], aes(x = Data, y =Rf -xG)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Fatti - xG") +
ggtitle("Differenza Gol Fatti - xG per ogni Partita post Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
geom_smooth(method = "lm", se = FALSE, color = "red")
# Grafico delle differenze tra gol subiti e xGA per tutte le partite
ggplot(data = table[[2]][1:35,], aes(x = Data, y =Rs -xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita pre Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
# Grafico delle differenze tra gol subiti e xGA per tutte le partite
ggplot(data = table[[2]][36:nrow(table[[2]]),], aes(x = Data, y =Rs -xGA)) +
geom_point(color = "red", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Data", y = "Differenza Gol Subiti - xGA") +
ggtitle("Differenza Gol Subiti - xGA per ogni Partita post Milan 0-4") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
geom_smooth(method = "lm", se = FALSE, color = "red")
```
we can see that the intercept after the Milan match became really low, and in the next 10 games the Naples managed to overperforme only in two cases, the same goes for the defensive performance, Napoli conceded a lot more. The analysis above confirm that the match against Milan influnced the rest of the season, is not easy to withstand a 0-4 in your home and than be kicked out of Champions League by the hand of the same team
#Spectators in home game
```{r}
partite_casa <- subset(table[[2]], Stadio == "Casa")
partite_casa$Data <- as.Date(partite_casa$Data)
ggplot(data = partite_casa, aes(x = Data, y = Spettatori)) +
geom_point(aes(color = Competizione), size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90)+
scale_color_manual(values = c("Serie A" = "green", "Champions Lg" = "blue")) +
labs(x = "Data", y = "Numero di Spettatori") +
ggtitle("Andamento Numero di Spettatori nelle Partite Stadio Casa") +
theme_minimal()
```
I wanted to see how many spectators there were in each game, just a curiosity of mine because at the end of the season it was almost impossible to find avaible tickets!
#Ball Possession
```{r}
table[[2]]$Data <- as.Date(table[[2]]$Data)
ggplot(data = table[[2]], aes(x = Data, y = Poss., color = Risultato)) +
geom_point(size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90)+
scale_color_manual(values = c("V" = "green", "P" = "red", "N" = "blue")) +
labs(x = "Data", y = "Possesso palla") +
ggtitle("Andamento del Possesso Palla nel Tempo") +
theme_minimal()
```
One of the main point of the gameplay of our coach Luciano Spalletti is keeping the ball as much as we can, in fact Naples has the biggest ball possession in the whole Serie A and it is one of the best in the top 5 league
###Correlation between ball possession and Winning a game
Are this correlated??Let's check
```{r}
table[[2]]$Poss. <- as.numeric(table[[2]]$Poss.)
ggplot(data = table[[2]], aes(x = Poss., y = Risultato)) +
geom_point(color = "blue", size = 3) +
geom_text(aes(label = Avversario), vjust = -0.5, hjust = 1, angle = 90) +
labs(x = "Possesso palla", y = "Differenza Gol Fatti - Gol Subiti") +
ggtitle("Correlazione tra Possesso Palla e Risultato") +
theme_minimal()
table[[2]]$Risultato_numerico <- ifelse(table[[2]]$Risultato == "V", 1,
ifelse(table[[2]]$Risultato == "N" || table[[2]]$Risultato == "P", -1, 0))
correlation <- cor(table[[2]]$Poss., table[[2]]$Risultato_numerico)
correlation
```
The correlation coefficient of -0.08156555 indicates a very weak negative correlation between ball possession and the result obtained. This value indicates that there is no strong linear relationship between these two variables.
In particular, the negative sign indicates that as ball possession increases, the result obtained tends to be slightly more unfavorable (less frequent victories or more frequent defeats).
These could be understood watching the game of Naples, usually we had problem with small team that tend to park a bus in their area, Naples team work really well in vertical, instead in small space we can only count on Kvara performance or really fast pass to get through the defense. In game when Napoli can't score they tend to keep the ball all the time to find some space to attack while the other team defende in 10 men, that's why when the ball possession gets to high there is a possibility Naples is losing
#Formation
along the season Naples tested two formation
```{r}
frequenza_formazioni <- table(table[[2]]$Formazione)
barplot(frequenza_formazioni, main = "Frequenza delle formazioni", xlab = "Formazione", ylab = "Frequenza")
```
as you can see in the barplot Naples used mostly the 4-3-3,using Zielinski as a midfielder, instead 4-2-3-1 was used when not all the main players were avaible, using Zielinski or Raspadori as a TRQ
```{r}
library(dplyr)
performance_formazione <- table[[2]] %>%
group_by(Formazione) %>%
summarise(media_gol_fatti = mean(Rf),
deviazione_gol_fatti = sd(Rf),
min_gol_fatti = min(Rf),
max_gol_fatti = max(Rf),
media_gol_subiti = mean(Rs),
deviazione_gol_subiti = sd(Rs),
min_gol_subiti = min(Rs),
max_gol_subiti = max(Rs),
media_possesso_palla = mean(Poss.),
deviazione_possesso_palla = sd(Poss.),
min_possesso_palla = min(Poss.),
max_possesso_palla = max(Poss.))
performance_formazione
```
we can see that the performance of the 4-3-3 was better, in fact this was the scheme Spalletti used for the whole season
```{r}
library(ggplot2)
ggplot(data = performance_formazione, aes(x = Formazione, y = media_gol_fatti)) +
geom_bar(stat = "identity", fill = "blue") +
labs(x = "Formazione", y = "Media Gol Fatti") +
ggtitle("Media Gol Fatti per Formazione")
ggplot(data = performance_formazione, aes(x = Formazione, y = media_gol_subiti)) +
geom_bar(stat = "identity", fill = "red") +
labs(x = "Formazione", y = "Media Gol Subiti") +
ggtitle("Media Gol Subiti per Formazione")
ggplot(data = performance_formazione, aes(x = Formazione, y = media_possesso_palla)) +
geom_bar(stat = "identity", fill = "green") +
labs(x = "Formazione", y = "Media Possesso Palla") +
ggtitle("Media Possesso Palla per Formazione")
```
#Shot analysis
```{r}
data <- table[[5]]
playername=data$Giocatore[-(28:29)]
data <- data[, -(1:4)]
data <- data[, -ncol(data)]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
data <- apply(data, 2, function(x) replace(x, is.na(x), 0))
correlation_matrix <- as.data.frame(cor(data))
print(correlation_matrix)
```
I checked the correlation between each variable about shooting to see if there is something worth to analyze. Actually the result are really predictable so we don't need any further analysis, the only thing interesting is that the correlation between the mean distance of shooting and the goal kick taken was lower than i thought. The designed player to take goal kick are Zielinski and Politano that usually shot from a distance or at the end of the penalty area. i think the reason for the low correlation is that sometimes Victor Osimhen take goal kick, with no great results, but since he is the player that shot from the shorter distance he is the main reason of why there is no correlation
###Cluster
```{r}
independent_vars <- c("Tiri","% TiP", "Tiri/90", "TiP/90", "G/Tiri", "G/TiP", "Dist.", "Pun.", "Rigori", "Rig T", "xG", "npxG", "npxG/Sh", "G-xG", "np:G-xG")
data=as.data.frame(data)
data <- data.frame(Reti = data$Reti, data[, independent_vars])
data <- data[-(28:29),]
data <- apply(data, 2, as.numeric)
standardized_data <- scale(data)
k_values <- 2:10 # Range of k values to evaluate
wss <- vector("numeric", length = length(k_values)) # Vector to store within-cluster sum of squares
for (i in 1:length(k_values)) {
kmeans_result <- kmeans(standardized_data, centers = k_values[i])
wss[i] <- kmeans_result$tot.withinss
}
plot(k_values, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters (k)", ylab = "Within-Cluster Sum of Squares (WSS)", main = "Elbow Method")
elbow_index <- which(diff(wss) < mean(diff(wss))) # Find the index where the rate of change drops significantly
optimal_k <- k_values[elbow_index] # Select the corresponding k value
cat("Optimal number of clusters:", optimal_k, "\n")
optimal_k <- 3
kmeans_result <- kmeans(standardized_data, centers = optimal_k)
cluster_assignments <- kmeans_result$cluster
data_with_clusters <- bind_cols(data, Cluster = cluster_assignments)
independent_vars <- c("Tiri","X..TiP", "Tiri.90", "TiP.90", "G.Tiri", "G.TiP", "Dist.", "Pun.", "Rigori", "Rig.T", "xG", "npxG", "npxG.Sh", "G.xG", "np.G.xG")
for (var in independent_vars) {
plot_data <- bind_cols(data_with_clusters, Variable = data[, var])
plot_data$Player <- playername
plot <- ggplot(plot_data, aes(x = Reti, y = Variable, color = as.factor(Cluster), label = Player)) +
geom_point(alpha = 0.5) +
geom_text(vjust = -1) + # Adjust the position of the player name labels
labs(title = paste("Clustering Analysis for", var),
x = "Reti",
y = var,
color = "Cluster") +
theme_minimal()
print(plot)
}
```
the cluster analysis divide our player in 3 group, the first one has the defend with low partecipation on attacking, the second cluster is really big and have the majority of the player, instead the third one have our top player Kvara and Osimhen. I plotted the most important variable regarding the shooting
###PCA
```{r}
data_pca <- data[, independent_vars]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
```
the pca analysis synthetize all the plot we saw before in only one graphic, we can see how Zanoli and Gaetano as subbed in try to seize the opportunity, on the -,- part of the cartesian plan there are the player not present in goal zone, instead obv we have Osimhen and Kvaratskhelia with the best stats. Worth to mention Elmas and Politano that had a good overperformance and Simeone and Raspadori that as you can see have really good stats as subbed in, from the plot we can see that they know were to stand, waiting for a goal oppportunity, and when it comes they don't miss it, in fact they scored 2/3 goal each worth 3 point, deciding a match on the last minute
#Passing analysis
i made the same correlation matrix for the passes
```{r}
data2= table[[6]]
data2 <- data2[, -(2:4)]
data2 <- data2[, -ncol(data2)]
data2 =as.data.frame(data2[-(28:29),])
playername=data2$Giocatore
data2 <- apply(data2, 2, as.numeric)
data2 <- apply(data2, 2, function(x) replace(x, is.na(x), 0))
data2=as.data.frame(data2)
completion_rate <- data2$Compl. / data2$`Tent,`
hist(completion_rate, breaks = 10, col = "lightblue", main = "Completion Rate Distribution")
# Player Performance Analysis
player_performance <- data.frame(Player = playername, CompletionRate = completion_rate)
# Top 5 players with highest completion rates
top_players <- player_performance[order(player_performance$CompletionRate, decreasing = TRUE), ][1:5, ]
print(top_players)
```
i put a lot of interest in the completation rate of passing, because in our team is a main factor and that's the reason Lobotka is one of the key players of the team
```{r}
player_performance <- data.frame(Player = playername, CompletionRate = completion_rate,dist_tot=data2$`Dist. Tot.`, completions=data2$Compl., XAG=data2$xAG)
boxplot(CompletionRate ~ Player, data = player_performance,
main = "Completion Rate by Player", xlab = "Player", ylab = "Completion Rate",
las = 2)
unique_players <- unique(player_performance$Player)
player_positions <- 1:length(unique_players)
text(player_positions, par("usr")[3] - 0.1, labels = unique_players, srt = 90, adj = c(1, 0.5), xpd = TRUE)
```
i used a boxplot to plot all the information in one graphic
###Cluster
```{r}
independent_vars <- c("PrgP", "Dist. Tot.", "Dist. Prog.","xAG","PF","PPA","Cross in area")
data <- data.frame(CompletionRate = completion_rate, data2[, independent_vars])
data <- data[-(28:29),]
data <- apply(data, 2, as.numeric)
standardized_data <- scale(data)
k_values <- 2:10
wss <- vector("numeric", length = length(k_values))
for (i in 1:length(k_values)) {
kmeans_result <- kmeans(standardized_data, centers = k_values[i])
wss[i] <- kmeans_result$tot.withinss
}
plot(k_values, wss, type = "b", pch = 19, frame = FALSE, xlab = "Number of Clusters (k)", ylab = "Within-Cluster Sum of Squares (WSS)", main = "Elbow Method")
elbow_index <- which(diff(wss) < mean(diff(wss))) # Find the index where the rate of
optimal_k <- k_values[elbow_index]
cat("Optimal number of clusters:", optimal_k, "\n")
optimal_k <- 3
kmeans_result <- kmeans(standardized_data, centers = optimal_k)
cluster_assignments <- kmeans_result$cluster
data_with_clusters <- bind_cols(data, Cluster = cluster_assignments)
independent_vars <- c("PrgP", "Dist..Tot.", "Dist..Prog.","xAG","PF","PPA","Cross.in.area")
for (var in independent_vars) {
plot_data <- bind_cols(data_with_clusters, Variable = data[, var])
plot_data$Player <- playername
plot <- ggplot(plot_data, aes(x = CompletionRate, y = Variable, color = as.factor(Cluster), label = Player)) +
geom_point(alpha = 0.5) +
geom_text(vjust = -1) +
labs(title = paste("Clustering Analysis for", var),
x = "Completion Rate",
y = var,
color = "Cluster") +
theme_minimal()
print(plot)
}
```
this cluster give us a clear differentation: the firsr cluster is about player who pass a lot to set up the game rather than to assist, for instance DC, CMD and Oliveira that as a fullback is the one that is more focused on defend.
The second cluster is about the player that don't pass a lot, let's say they are more incline to receive rather than to pass or they are not part of the main players. The last cluster is about fullback, wings and TRQ: so player that cross a lot and give assist to our forward
I plotted the most important variable regarding the passing
###Passing Distribution
```{r}
data_contrasts <- data2[, c("Compl.", "Compl.", "Compl.")]
data_contrasts$Giocatore <- playername
data_contrasts_long <- reshape2::melt(data_contrasts, id.vars = "Giocatore")
variable <- c("pass brevi", "pass medi", "pass lunghi")
contrasts_plot <- ggplot(data_contrasts_long, aes(x = Giocatore, y = value, fill = variable)) +
geom_bar(stat = "identity") +
labs(title = "Passing Distribution",
x = "Players",
y = "Value") +
scale_fill_manual(values = c("darkgreen", "darkblue", "darkorange"),
labels = c("pass brevi", "pass medi", "pass lunghi")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_blank())
print(contrasts_plot)
```
just to visualize what type of pass each player mostly uses
###PCA
```{r}
data_pca <- data[, independent_vars]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
```
In the PCA analysis the differentation is more clear, on the +,+ side we have the playmakers of the team, with the majority of passes and progressiv passes, instead in the lower quadrant we have the player that create the majority of goal chance, that include pass inside the penalty area the pass who lead to a shot
###Type of pass
```{r}
data=table[[7]]
data <- apply(data, 2, as.numeric)
data=as.data.frame(data[-(28:29),])
data=data[,-(2:4)]
data=data[,-(18)]
variables <- c( "Pun.", "PassFil", "Scambi", "Cross", "Rimesse in gioco", "Angoli", "Conv.", "Div.", "Dir.", "Compl.", "Fuorigioco", "Blocchi")
data$Giocatore=playername
for (variable in variables) {
variable_data <- subset(data, select = c("Giocatore", variable))
pass_data_long <- reshape2::melt(variable_data, id.vars = "Giocatore")
pass_plot <- ggplot(pass_data_long, aes(x = Giocatore, y = value)) +
geom_bar(stat = "identity", width = 0.5) +
labs(title = paste("Type of pass =", variable),
x = "Players",
y = "Pass Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) # Rotate x-axis labels if needed
print(pass_plot)
}
```
i just plotted the stats for type of passes for each player for a better visualizzation
```{r}
correlation_matrix <- as.data.frame(cor(data[,-(1)]))
print(correlation_matrix)
```
seeing the corr matrix i didn't find anything worth to analyze
#Creation of goal or shot opportunities
```{r}
variables <- c("SCA", "SCA90", "PassaggiInGioco", "PassaggiNonInGioco", "A", "Tiri", "Falli", "Def.", "GCA", "GCA90")
data=table[[8]]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
data=as.data.frame(data[-(28:29),])
data=data[,-(1:4)]
data=data[,-(22)]
data_pca <- data[, variables]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
```
The Scree plot shows the variance explained by each principal component, allowing you to determine how many principal components to consider
The biplot displays the observations (players) and inplane variables of the first two principal components, allowing to identify potential patterns or clusters.
We can see how kvaratskelia is the key player of the team, the one that create most goal or shot opportunities
#Defensive action
```{r}
variables <- c("Cntrs", "Cntrs.1", "Blocchi", "Int", "Salvat.")
data=table[[9]]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
data=as.data.frame(data[-(28:29),])
data=data[,-(1:4)]
data=data[,-(22)]
data_pca <- data[, variables]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
```
from this plot we can see how our rearguard works, we have the CDM that have the job to intercept any loose ball and to stop the other team dribblers. From our fullback we can see how Di Lorenzo is more likely to incercept the ball to go for a counterattack, instead as a TS even if Mario Rui played more, Oliveira has better defensive stats(they are not for 90 min) that means that means that the defensive contribution of Mario is really poor. Worth to mention Kvaraskheila that recovered a lot of ball in the last quarter of the field and Lozano: even if his scoring contribution was insufficient this year he made a lot of dirty work, sometimes acting as a fullback when Di Lorenzo was attacking and doing a great job in intercepting ball, for instance we score against Frankufurt and Cremonese thanks to the ball he recovered when doing high intensity pressing and lead us to score important goals
###Distribution of were contrast happen
```{r}
data_contrasts <- data[, c("Treq. dif.", "Treq. cen.", "Treq. off.")]
data_contrasts$Giocatore <- playername
data_contrasts_long <- reshape2::melt(data_contrasts, id.vars = "Giocatore")
contrasts_plot <- ggplot(data_contrasts_long, aes(x = Giocatore, y = value, fill = variable)) +
geom_bar(stat = "identity") +
labs(title = "Contrasts Distribution",
x = "Players",
y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_blank())
print(contrasts_plot)
```
#Ball Controll
```{r}
data=table[[10]]
data <- apply(data, 2, function(x) as.numeric(gsub(",", ".", x, fixed = TRUE)))
data=as.data.frame(data[-(28:29),])
data=data[,-(1:4)]
data=data[,-(25)]
variables <- c("Tocchi", "Att", "Tkld", "Controlli palla", "Dist. Tot.","Dist. Prog.","Ric.")
data_pca <- data[, variables]
pca_result <- PCA(data_pca, graph = FALSE)
fviz_eig(pca_result, addlabels = TRUE)
biplot <- fviz_pca_biplot(pca_result, label = "var", repel = TRUE)
biplot <- biplot + geom_text(aes(label = playername), hjust = 0.5, vjust = -0.5, size = 3, color = "black")
print(biplot)
```
the ball controll analysis using PCA give us a full understanding of the study using only 2 dimensions. Kvara is by far the best about taking possession of the ball. on the +,- quadrant we find the player who are at the center of Napoli's ball possession, instead Anguissa exceed in both so he is put in the middle
###Distribution of touch according to the area of the field
```{r}
data_contrasts <- data[, c("Area dif.", "Treq. dif.", "Treq. cen.","Treq. off.","Area off.")]
data_contrasts$Giocatore <- playername
data_contrasts_long <- reshape2::melt(data_contrasts, id.vars = "Giocatore")
contrasts_plot <- ggplot(data_contrasts_long, aes(x = Giocatore, y = value, fill = variable)) +
geom_bar(stat = "identity") +
labs(title = "Contrasts Distribution",
x = "Players",
y = "Value") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
axis.title.y = element_blank())
print(contrasts_plot)
```
#Player with most PAP(point per match) with at least 10 games as a starter
```{r}
data=table[[11]]
data=data[,1:16]
data=as.data.frame(data[-(17:32),])
data$Tit <- as.numeric(data$Tit)
data_filtered <- data %>%
filter(Tit >= 10)
data_sorted <- data_filtered %>%
arrange(desc(PAP))
top_5_players <- data_sorted %>%
top_n(5, PAP) %>%
select(c(1, 10, 16))
print(top_5_players)
```
Mario rui is our lucky charm
#Relationship between Fouls and Yellow Card
```{r}
data=table[[12]]
data=data[,-10]
data=as.data.frame(data[-(28:29),])
data$Falli=as.numeric(data$Falli)
data$Amm.=as.numeric(data$Amm.)
scatter_plot <- ggplot(data, aes(x = Falli, y = Amm., label = Giocatore)) +
geom_point() +
geom_text(hjust = -0.2, vjust = -0.2) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Add regression line
labs(title = "Relationship between Fouls and Yellow Cards",
x = "Fouls",
y = "Yellow Cards") +
theme_minimal()
print(scatter_plot)
correlation <- cor(data$Falli, data$Amm.)
# Print the correlation coefficient
print(correlation)
```