Baseball: Hot and Cold Streaks
- Contributed by Malcolm Hess.
- Malcolm is part of the 12-Week Data Science Bootcamp with Vivian Zhang in the spring of 2015.
- This post is based on Malcolm's project investigating hot and cold streaks for hitters in baseball
- Set up a system where I can easily create my own database of baseball data. I chose a website that has day by day data which has the needed granularity so that I can research hot and cold streaks.
- Create a function that will download the information, format it properly, and then save it to my local computer. This data should contain the career batting data for a specific player in a game-by-game batting log format.
- Create a function that will load in the data for that player.
- Create a visualization of hits across the year to reveal potential patterns in hot and cold streaks.
Results
The chart is a new and interesting way to look at baseball data. It gives you a better feel for the strength of a player in a given time frame. I chose to highlight this year because it is one of this player's worst years in recent times. I was hoping to see a cycle between hot and cold across multiple days, but such a cycle isn't really visible. August and September were certainly better than the earlier months of the season, but overall I think it is clear that the year is just a poor one by Josh Hamilton's standards. I've looked at multiple players and multiple seasons and have found very little visual evidence to support the existence of a hot and cold cycle throughout the year. I have also not found any evidence that certain players have a general and unique yearly cycle. Such a cycle could be, for example, a weak first half of the year followed by a very strong second half.
I think we can glean some very interesting insights from the fact that these cycles do not exist. First of all, there is no evidence to support that a player's success should cycle with the expected opponent's pitching rotation. We also can't assume that players will be consistent from year to year in when they get their hits, whether that means consistent hitting throughout the year or hits clustered together in one or two months. By the end of the year we still expect a player to have results similar to the previous year, but how the player gets there seems to be incredibly random, with no determinable cycle.
All code can also be found here: Code
Code
Code Chunk 1: setting up the create-a-player function
createplayer <- function(playerfirstname, playerlastname, key=1){
require(XML)
require(RCurl)
#check if key is valid entry
if (class(key)!= "numeric"){
stop("Invalid key: Requires number 1-9")
stop}
if (key > 9 | key < 1){
stop("Invalid key: Requires number 1-9")
stop}
#keys are always two digit, so if less than 10 it makes
#key to a string and adds a zero to the front
if (key < 10){
key <- substring(toString(key), 1, 1)
key <- paste("0", key, sep="")
}
#checks to see if Baseball directory exists, and if not creates it.
if(!file.exists("Baseball")){createwd("Baseball") }
setwd('Baseball')
playerfirstname <- as.character(playerfirstname)
playerlastname <- as.character(playerlastname)
#cleaning names and key to make player identity object
subfirst <- substring(playerfirstname, 1, 2)
sublast <- substring(playerlastname, 1, 5)
identity <- paste(sublast, subfirst, key, sep="")
identity <- tolower(identity)
#checks to see if player already exists in local database
filename <- paste0(identity, ".csv")
if(file.exists(filename)){
stop("Player already exists in database")
}
#making url to get to base page for the specified player
url <- paste0("http://www.baseball-reference.com/players/gl.cgi?id=", identity)
raw <- getURL(url)
data <- htmlParse(raw)
#making a list of all the years that this player has played in
xpath <- '//*[@id="stats_sub_index"]/ul/li[4]/ul/li/a'
nodes <- getNodeSet(data, xpath)
years <- sapply(nodes, xmlValue)
#cleaning up the list of years, need to remove postseason and turn characters to numbers
years <- years[!is.element(years, "Postseason")]
years <- as.numeric(years)
years<-sort(years)
amountofyears <- length(years)
Code Chunk 3: acquiring and cleaning one year of batting log data for the specified player
getyeardata <- function(ident = identity, year=2014){
#setting up URL to get data from a specific year
url1<- "http://www.baseball-reference.com/players/gl.cgi?id="
url2<- "&t=b&year="
urlyear <- paste(url1, identity, url2, year, sep="")
#downloading html site and taking out the table with the batting data
html <- htmlTreeParse(urlyear, useInternal=TRUE)
tables <- readHTMLTable(html)
batlog<- tables$batting_gamelogs
rows<- nrow(batlog)
i<-1
while(i<=rows){ #removes Month rows
if(batlog[i,1]=="April" | batlog[i,1]=="May" | batlog[i,1]=="June"|
batlog[i,1]=="July"| batlog[i,1]=="August"| batlog[i,1]=="September"|
batlog[i,1]=="October"){
batlog <- batlog[-i,]
i <- i - 1
rows <- nrow(batlog)
}
i <- i + 1
}
#adding a column to the front of the data that has identity and year on it
#remove first column which is just row number, imported from html table.
batlog <- batlog[,-1]
batlog <- transform(batlog, Player=identity)
batlog <- transform(batlog, Year=year)
temp<- batlog[,36:37]
batlog <- batlog[,-36:-37]
batlog <- cbind(temp,batlog)
#more data cleanup. Var.5 is currently the home/away column, away games signified with @
#Date variable transformed to character so I can clean up the dates and later and a year to it.
rows<- nrow(batlog)
batlog <- transform(batlog, Var.5= as.character(Var.5), Date=as.character(Date), Gtm = as.character(Gtm))
#double headers have extra symbols on them after the date, I need to
#remove (1) or (2) from them to properly transform it to a date class.
#Gtm will also have extra () based on amount of games a player missed
#I am also adding the year to the end of the date.
i <-1
while(i<=rows){
if(grepl(")", batlog$Date[i])) {
nc <- nchar(batlog$Date[i])
batlog$Date[i] <- substring(batlog$Date[i], 1, (nc-4))
}
if(gre)
batlog$Date[i] <- paste(batlog$Date[i], year, sep=", ")
i <- i +1 }
batlog<- subset(batlog, Gcar != "Tm")
batlog<- subset(batlog, H != "HR")
#more data cleaning
colnames(batlog)[7] <- "Home"
batlog <- transform(batlog, DELTAAVG= NA, BA = as.numeric(as.character(BA)), Home = as.character(Home))
#One K loop to do two things. First, deal with issues of home/away, second create deltaavg.
#adds 'H' (symbolize home game) to blank entires, away games are '@' symbol
k<-1
while(k<=nrow(batlog)){
if(batlog[k,7]!= "@"){
batlog[k,7] <- "H"
}
##Makes new variable (deltaavg) that is the difference of Batting average from day to day
if((k+1) <= nrow(batlog)){
batlog$DELTAAVG[(k+1)] <- (batlog$BA[(k+1)] - batlog$BA[k])
}
k<-k+1
} #end of k while loop
batlog
}#end of getyeardata
Code Chunk 4: getting each year for a player and combining it into one data frame. Then saving that data to the local computer.
#initializing object (careerdata) which will become the main dataframe
careerdata <- NULL
j<-1
#Go through all years and rbind the data together into the careerdata object
while (j <= amountofyears){
b<- getyeardata(identity, year = years[j])
careerdata <- rbind(careerdata, b)
j<-j+1
}
filename <- paste0(identity, ".csv")
write.csv(careerdata, file= filename)
} #End of create player function!
Code Chunk 5: creating a function to load in a specific player's data.
loadplayer <- function (playerfirstname, playerlastname, key=1){
setwd("E:/Baseball")
if (key < 10){
key <- substring(toString(key), 1, 1)
key <- paste("0", key, sep="")
}
playerfirstname <- as.character(playerfirstname)
playerlastname <- as.character(playerlastname)
#cleaning names and key to make player identity object
subfirst <- substring(playerfirstname, 1, 2)
sublast <- substring(playerlastname, 1, 5)
identity <- paste(sublast, subfirst, key, sep="")
identity <- tolower(identity)
filename <- paste0(identity, ".csv")
dataframe <- read.csv(filename, header=TRUE)
dataframe
}
Code Chunk 6: visualizing the data
source('calendarheat.R')
currentplayer <- loadplayer("josh", "hamilton", 3)
simple <- transform(currentplayer, date = as.Date(Date, format = "%b %d, %Y"), h= as.numeric(H), Date=as.character(Date))
sub1 <- subset(simple, format(date, "%Y") %in% c("2013"))
calendarHeat(sub1$date, sub1$h, date.form = "%b %d, %Y")