DS6306: Cast Study 01

# Introduction #########
#
#
# The following is an analysis of one-hundred styles of beer brewed in the United States for the executive team,
# CEO and CFO at Budweiser. Budweiser is interested in exploring the how many breweries are in the United States, 
# how each beer is reported in terms of its International Bitterness Unit and Alcohol By Content and basic summary
# statistics and conclusions we are able to uncover with the beer data provided. Statistics will include handling
# missing data and explaining why it was possibly not included in the initial dataset, as well as uncover median 
# and maximum (IBU and ABV) ratings by state. Conclusions will include basic summary statics on the ABV variable, 
# any relationship between the IBU and ABV variables (such as dependencies, e.g. does a higher IBU result in a 
# higher ABV) and finally we will look to see if we can determine general beer styles (Ales and IPAs) based on 
# ABV and IBU values. Additionally, we will report on any findings that are discovered during the analysis.
#
######################
#                    #
#     Libraries      #
#                    #
######################
######################
library(usmap)
library(ggplot2)
library(magrittr)
library(ggplot2)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(readr)
library(tibble)
library(tidyverse)

## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ purrr   0.3.4     ✓ forcats 0.5.0
## ✓ dplyr   1.0.2

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()

library(robustbase)
library(class)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(e1071)
library(dplyr)
library(codebook)
library(future)

## 
## Attaching package: 'future'

## The following object is masked from 'package:caret':
## 
##     cluster

library(fmsb)

## Registered S3 methods overwritten by 'fmsb':
##   method    from
##   print.roc pROC
##   plot.roc  pROC

library(ggraph)
library(igraph)

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:future':
## 
##     %->%, %<-%

## The following object is masked from 'package:class':
## 
##     knn

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:purrr':
## 
##     compose, simplify

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following object is masked from 'package:tibble':
## 
##     as_data_frame

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(RColorBrewer)
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

######################
######################
#                    #
#        Data        #
#                    #
######################
########################################################
#read in brewery data
breweryDat <- read.csv("breweries.csv")
breweryDat$State <- trimws(breweryDat$State)
#datafile to organize states into census regions
regionData <- read.csv("geocodes.csv")
regionData <- regionData[,-c(1,2)]
regionData <- dplyr::rename(regionData, "FIPS"="State..FIPS.", "Region" = "Region.1", "Division" = "Division.1")
regionData$State <- trimws(regionData$State)
#Ensure structure of data is compliant 
#head(breweryDat)
#read in beer data
beerDat <- read.csv("beers.csv")
#Loop to fix leading decimal places on ABV
i <- 1
count <- length(beerDat$Name)
for (i in 1:count) {
 if(is.na(beerDat[i,3])){
  beerDat[i,3]=0
   }
   if(beerDat[i,3]<1){
    beerDat[i,3] <- beerDat[i,3]*100
  }
}
#Ensure structure of data is compliant 
#head(beerDat)
########################################################

#Instructions You can assume that your audience is the CEO and CFO of Budweiser (your client) and that they only have had one class in statistics and have indicated that you cannot take more than 7 minutes of their time. 20% of your grade will be based on the presentation.

They have hired you to answer the 7 questions listed below and beyond those general questions you may speculate / anticipate what may be of interest to them

######################
#                    #
#     Question 1     #
#                    #
######################
########################################################
#How many breweries are present in each state?
########################################################
#########################################################
#Use Dplyr to group breweries by state
brewByState <- breweryDat %>% 
  group_by(State) %>% 
  dplyr::count()

brewByState

#########################################################
#########################################################
#Add breweries by state to state information dataframe
statepop$brewByState <- brewByState$n
View(brewByState)
#########################################################
#########################################################
#Fix mismatched state brewery count to state info df
statepop[1,5] <- 3
statepop[2,5] <- 7
statepop[3,5] <- 11
statepop[4,5] <- 2
statepop[8,5] <- 2
statepop[9,5] <- 1
statepop[14,5] <- 18
statepop[15,5] <- 22
statepop[16,5] <- 5
statepop[20,5] <- 9
statepop[22,5] <- 23
statepop[25,5] <- 2
statepop[26,5] <- 9
statepop[28,5] <- 5
statepop[29,5] <- 2
statepop[30,5] <- 3
statepop[32,5] <- 4
statepop[34,5] <- 19
statepop[33,5] <- 16
statepop[35,5] <- 1
statepop[45,5] <- 4
statepop[46,5] <- 10
statepop[47,5] <- 16
statepop[49,5] <- 1
statepop[50,5] <- 20
#Check data 
#View(statepop)
#View(brewByState)
#########################################################
#########################################################
#Call plot functions to plot state brewery count on USmap
nationBrewPlot <- plot_usmap(data = statepop, values = "brewByState",labels=TRUE, color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma) + theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
#display plot
nationBrewPlot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, NE first
NEplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .new_england,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
NEplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, Mid Atlantic second
MAplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .mid_atlantic,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
MAplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, East North Central third
ENCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .east_north_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
ENCplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, West North Central fourth
WNCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .west_north_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
WNCplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, South Atlantic fifth
SAplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .south_atlantic,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
SAplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, East South Central sixth
ESCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .east_south_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
ESCplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, West South Central seventh
WSCplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .west_south_central,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
WSCplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, Mountain eighth 
Mplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .mountain,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
Mplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################
#########################################################
#Break down by region, Pacific ninth 
Pplot <- plot_usmap(data=statepop, values = "brewByState",labels = TRUE,include = .pacific,color = "grey73") + scale_fill_continuous(low = "purple", high = "green", name = "Brewery Count", label = scales::comma)+ theme(legend.position = "bottom")+labs(title = "Total Brewery Count Per State")
Pplot

## Warning: Use of `map_df$x` is discouraged. Use `x` instead.

## Warning: Use of `map_df$y` is discouraged. Use `y` instead.

## Warning: Use of `map_df$group` is discouraged. Use `group` instead.

## Warning: Use of `centroid_labels$x` is discouraged. Use `x` instead.

## Warning: Use of `centroid_labels$y` is discouraged. Use `y` instead.

## Warning: Use of `centroid_labels$abbr` is discouraged. Use `abbr` instead.

#########################################################

######################
#                    #
#     Question 2     #
#                    #
######################
#############################################################################################################
#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.  (RMD only, this does not need to be included in the presentation or the deck.)
#############################################################################################################
#############################################################################################################
#
# We merged the breweries.csv dataset with the beers.csv dataset, additionally when we imported
# the individual datasets, we also imported a dataset that allows us to associate each beer with 
# its brewery's US Census Division.  
#
#############################################################################################################
#Use Dplyr package to merge the two tables together
buzzbrews <- merge(breweryDat, beerDat, by.x = "Brew_ID", by.y = "Brewery_id", all = TRUE )
#Use Dplyr package to rename "Name.x" to "Brewery" and "Name.y" to "Beer"
buzzbrews <- dplyr::rename(buzzbrews, "Brewery" = "Name.x","Beer"="Name.y")
#Check the results
#View(buzzbrews)

######################
#                    #
#     Question 3     #
#                    #
######################
###########################################
# Question 3 - Address the missing values in each column.
#
# During the initial exploratory process we discovered NA's in both the IBU and ABV columns. 
# Upon further investigation we determined that some styles of beer, mixed or barrel aged beers
# do not have an ABV available at the time the brewery submits packaging labels to TTB, or Alcohol
# and Tobacco Tax and Trade Bureau. The TTB is the federal agency that determines what can and cannot
# be put on a beer label including the art, type size, verbiage, where elements are placed and etc.
# So beers without an AVB available either do not inlcude it, or add it to the bottom of the cans 
# or packaging at a later date. 
#
# In terms of the missing IBU values, we determined that even though the IBU alludes to the bitterness
# of a beer's taste, it is somewhat misleading because it is derived from a test that measures different
# chemical compounds that are known to cause bitter flavoring. For instance, a beer may have a high IBU 
# value, but due to other ingredients, such as added lactose or sucrose may actually have a sweeter taste 
# than would be expected from a high IBU. The other comfounding variable is if the brewery can afford the 
# equipment used to generate an IBU value, smaller breweries simply cannot afford it while the larger 
# breweries typically just use IBU as a quality control measure.
#
# Finally, we concluded that imputing data or filling in the missing gaps was a good idea for this
# analysis and that was done by taking an average of from similiar styles of beer and assigning that to
# beers in the same sytle classification that did not have values. Upon random testing of different imputed
# values, by googling beers that had missing values in the dataset and comparing that to the created averages,
# it was determined that the imputed values were very close to the actual values in the marketplace.
#
###########################################
#Loop to fix numbering for Column 1 "brew ID"
iterations <- length(buzzbrews$Brew_ID)
for (i in 1:iterations) {
  buzzbrews[i,1]=i
}
#Fix no style beers to none
levels(buzzbrews$Style) <- c(levels(buzzbrews$Style), "none")
for (i in 1:iterations) {
  if(is.na(buzzbrews[i,9])){
  }
}
for (i in 1:iterations) {
  if((buzzbrews[i,9])==''){
    #print(buzzbrews[i,9])
    buzzbrews[i,9]="none"
  }
}
#Prep new df to contain style and averages
buzzbrews$Style <- as.factor(buzzbrews$Style)
#Create a data frame with each style and a variable for average IBU
styleCount <- as.data.frame(levels(buzzbrews$Style))
styleCount$`levels(buzzbrews$Style)` <- as.character(styleCount$`levels(buzzbrews$Style)`)
#View(styleCount)
#Initialize mean ibu to zero (to avoid problems with N/As)
styleCount$meanIbu <- 0
#Make beer count to keep track of total in each style
styleCount$beerCount <- 0
#Make column for total ibus
styleCount$totalIBU <- 0
styleCount$meanABV <- 0
styleCount$ABVbeerCount <- 0
styleCount$totalABV <- 0
#Checking
#View(styleCount)
#styleCount <- styleCount[-c(1), ]
#View(styleCount)
#Calculate mean IBU for each category and store it in IBU df
#Calculate average IBU for each style and add it to df
#outer loop for all the beers
ibuSum <- 0
beerCount <- 0
i <- 1
for (i in 1:iterations) {
  if(is.na(buzzbrews[i,8])) {
    buzzbrews[i,8]=0
  }
  
  #inner for each style
  for (j in 1:100) {
      
    if(buzzbrews[i,9]==styleCount[j,1]){
     #Compute IBU sum
     styleCount[j,4] <- styleCount[j,4]+buzzbrews[i,8]
     
     
     
     #Total of each beer count
     styleCount[j,3] <- styleCount[j,3]+1
     
     if(buzzbrews[i,8]==0){
       styleCount[j,3] <- styleCount[j,3]-1
     }
    
     
     
    }
    #Mean IBU for each style
    styleCount[j,2] <- styleCount[j,4]/styleCount[j,3]
    }}
#Add average column from style count to buzzbrews df
for (i in 1:iterations) {
  if(buzzbrews[i,8]==0){
    for(j in 1:100){
      if(buzzbrews[i,9]==styleCount[j,1]){
        buzzbrews[i,8]=styleCount[j,2]
      }
    }
  }
}
# View(styleCount)
# View(buzzbrews)
# Now do it all again for ABV
# Calculate average ABV for each style and add it to df
# outer loop for all the beers
AlcSum <- 0
AlcVeerCount <- 0
i <- 1
for (i in 1:iterations) {
  if(is.na(buzzbrews[i,7])) {
   buzzbrews[i,7]=0
  }
  
  
  #inner for each style
  for (j in 1:100) {
   
    if(buzzbrews[i,9]==styleCount[j,1]){
     
     #Compute ALC sum
     styleCount[j,7] <- styleCount[j,7]+buzzbrews[i,7]*100
     
     
     
     #Total of each beer count
     styleCount[j,6] <- styleCount[j,6]+1
     
     if(buzzbrews[i,7]==0){
       styleCount[j,6] <- styleCount[j,6]-1
     }
    
     
     
    }
    #Mean ABV for each style
    styleCount[j,5] <- (styleCount[j,7]/styleCount[j,6])/100
    }
}
#Add average column from style count to buzzbrews df
for (i in 1:iterations) {
  if(buzzbrews[i,7]==0){
    for(j in 1:100){
      if(buzzbrews[i,9]==styleCount[j,1]){
        buzzbrews[i,7]=styleCount[j,5]
        
      }
      }
  }
}
#kill NaN's for other alcohol types with no hops
i <- 1
for(i in 1:iterations){
  if(is.na(buzzbrews[i,8])){
    buzzbrews[i,8] <- 0
  }
}
#Check out end results - look for any leftover NAs in DF
sapply(buzzbrews, function(x) sum(is.na(x)))

## Brew_ID Brewery    City   State    Beer Beer_ID     ABV     IBU   Style  Ounces 
##       0       0       0       0       0       0       0       0       0       0

# Add in the regional data from the census bureau
buzzbrews <- merge(buzzbrews, regionData, by = "State")
# View new dataframe
view(buzzbrews)

######################
#                    #
#     Question 4     #
#                    #
######################
#############################################################################################################
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare
#############################################################################################################
# We computed the MedStateABV and IBU for each state and created a visualisation that allowed
# us to further explore what those medians tell us. We found there appears to be a relationship
# between IBU and ABV where we can use IBU to estimate ABV of a given beer. 
#
# We explored this further by developing a model to make predictions based on historical IBU
# and ABV data and were able to predict that a beer with 32 IBU could have an ABV of 5.72% and
# we were 97.5% confident that beer would at least fall between 3.24% and 8.21%.
#
#################################################################################################################
buzzbrews$State <- trimws(buzzbrews$State)
# Group by state and compute 
combineddf <- buzzbrews %>%
  group_by(State) %>%
  dplyr::summarise(MedStateIBU = median(IBU), MedStateABV = median(ABV))

## `summarise()` ungrouping output (override with `.groups` argument)

combineddf <- as.data.frame(combineddf)
combineddf$MedStateIBU <- as.numeric(combineddf$MedStateIBU)
combineddf$MedStateABV <- as.numeric(combineddf$MedStateABV)
# Divisional measurements
divisiondf <- buzzbrews %>% 
  group_by(Division) %>% 
  dplyr::summarise(MedDivIBU = median(IBU), MedDivABV = median(ABV))

## `summarise()` ungrouping output (override with `.groups` argument)

# round values to xx.x ###
divisiondf$MedDivIBU <- round(divisiondf$MedDivIBU, digits = 1)
divisiondf$MedDivABV <- round(divisiondf$MedDivABV, digits = 1) 
combineddf$MedStateIBU <- round(combineddf$MedStateIBU, digits = 1)
combineddf$MedStateABV <- round(combineddf$MedStateABV, digits = 1)
# Add regions to combinddf
combineddf <- merge(combineddf,regionData,by="State")
# Add in divisional values
combineddf <- merge(combineddf, divisiondf, by = "Division")
####### Create chart labels for stacked charts #####
combineddf$ABVlabel <- paste(combineddf$State, combineddf$MedStateABV)
combineddf$IBUlabel <- paste(combineddf$State, combineddf$MedStateIBU)
view(combineddf)
# Create sums of medians for labeling charts #
StateSums <- combineddf %>%
  group_by(Division) %>%
  dplyr::summarise(SumStateABV = sum(MedStateABV), SumStateIBU = sum(MedStateIBU))

## `summarise()` ungrouping output (override with `.groups` argument)

combineddf <- merge(combineddf, StateSums, by = "Division")
#
############################################################
#########                                   ################
######### Draw Bar Charts                   ################
#########                                   ################
############################################################
#
combineddf %>%
  ggplot(aes(x=Division, y=MedStateABV,fill= reorder(State,-MedStateABV))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) + 
  # Add state and ABV value to each state's chart position
  geom_text(aes(label = ABVlabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division ABV Values to top of each chart stack
  geom_text(aes(Division, MedDivABV + SumStateABV -3, label = MedDivABV), size = 3, vjust = 1, fontface = "italic") +
  # Label the chart objects
  labs(title="Median ABV by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: ABV. ABV imputed where necessary.",
       y = "Alcohol By Volume",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Remove y-labels and ticks since this is a stacked chart
  theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
  # Wrap X-axis labels
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))

#
##### Create bar plot for IBU #####
#
combineddf %>%
  ggplot(aes(x=Division, y=MedStateIBU,fill= reorder(State,-MedStateIBU))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, position = position_stack(), show.legend = FALSE) + 
  # Add state and IBU value to each state's chart position
  geom_text(aes(label = IBUlabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division IBU Values to top of each chart stack
  geom_text(aes(Division, MedDivIBU + SumStateIBU - 15, label = MedDivIBU), size = 3, vjust = 1, fontface = "italic") +
  # Label the chart objects
  labs(title="Median IBU by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: IBU. IBU imputed where necessary.",
       y = "Median Int'l Bitterness Unit",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Remove y-labels and ticks since this is a stacked chart
  theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
  # Wrap X-axis labels
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))

######################
#                    #
#     Question 5     #
#                    #
######################
#############################################################################################################
# Question 5 - Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
#
# We determined that the maximum observed IBU was 138 in Oregon for Bitter Bitch Imperial IPA that 
# is an American Double/ Imperial IPA from the Astoria Brewing Company in Austoria, OR. 
# 
# We also determined that maximum observed ABV was 12.8% in Colorado for Lee Hill Series Vol. 5 – 
# Belgian Style Quadrupel Ale from Upslope Brewing Company in Boulder, CO.
#
#############################################################################################################
#Figure out which has highest ABV
MaxStateABV <- arrange(buzzbrews, desc(ABV))
print(MaxStateABV[1,4])

## [1] "Boulder"

#Figure out which has highest IBU
maxIBU <- arrange(buzzbrews,desc(IBU))
print(maxIBU[1,4])

## [1] "Astoria"

##### Question 5 Answer #####
## Colorado has the highest ABV = 12.8, Oregon has the highest IBU = 138.
###################################################
###### Create DF for just the max ABV & IBU values ######
# State measurements
maxStateValues <- buzzbrews %>% 
  group_by(State) %>% 
  dplyr::count(MaxStateABV = max(ABV), MaxStateIBU = max(IBU))
maxStateValues <- maxStateValues[,-4]
maxStateValues <- as.data.frame(maxStateValues)
maxStateValues$State <- trimws(maxStateValues$State)
str(maxStateValues)

## 'data.frame':    51 obs. of  3 variables:
##  $ State      : chr  "AK" "AL" "AR" "AZ" ...
##  $ MaxStateABV: num  6.8 9.3 6.1 9.5 9.9 ...
##  $ MaxStateIBU: num  71 103 45.7 99 115 ...

view(maxStateValues)
# Divisional measurements
divMaxValdf <- buzzbrews %>% 
  group_by(Division) %>% 
  dplyr::count(MaxDivABV = max(ABV), MaxDivIBU = max(IBU))
divMaxValdf <- divMaxValdf[,-4]
divMaxValdf <- as.data.frame(divMaxValdf)
# round values to xx.x ###
maxStateValues$MaxStateABV <- round(maxStateValues$MaxStateABV, digits = 1)
maxStateValues$MaxStateIBU <- round(maxStateValues$MaxStateIBU, digits = 1)
# Add regions to maxStateValues
maxStateValues <- merge(maxStateValues,regionData,by="State")
# Add in divisional values
maxStateValues <- merge(maxStateValues, divMaxValdf, by = "Division")
####### Create chart labels for stacked charts #####
maxStateValues$ABVmaxLabel <- paste(maxStateValues$State, maxStateValues$MaxStateABV)
maxStateValues$IBUmaxLabel <- paste(maxStateValues$State, maxStateValues$MaxStateIBU)
view(maxStateValues)
# Create sums of max values for labeling charts #
StateMaxSums <- maxStateValues %>%
  group_by(Division) %>%
  dplyr::summarise(SumStateABV = sum(MaxStateABV), SumStateIBU = sum(MaxStateIBU))

## `summarise()` ungrouping output (override with `.groups` argument)

maxStateValues <- merge(maxStateValues, StateMaxSums, by = "Division")
###################################################
###### Plot for Max ABV ###########################
maxStateValues %>%
  ggplot(aes(x=Division, y=MaxStateABV,fill= reorder(State,-MaxStateABV))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, 
           position = position_stack(), show.legend = FALSE) + 
  # Add state and ABV value to each state's chart position
  geom_text(aes(label = ABVmaxLabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division ABV Values to top of each chart stack
  geom_text(aes(Division, MaxDivABV + SumStateABV, label = MaxDivABV), size = 3, 
            nudge_y = -7, fontface = "italic") +
  # Label the chart objects
  labs(title="Max ABV by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: ABV. ABV imputed where necessary.",
       y = "Alcohol By Volume",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Remove y-labels and ticks since this is a stacked chart
  theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
  # Wrap X-axis labels
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))

###################################################
############### Chart Max IBU #####################
maxStateValues %>%
  ggplot(aes(x=Division, y=MaxStateIBU,fill= reorder(State,-MaxStateIBU))) +   
  # Create stacked by chart organized by Division with States stacked in each bar
  geom_bar(aes(color = "#c8102e"),stat="identity", width= 0.7, 
           position = position_stack(), show.legend = FALSE) + 
  # Add state and ABV value to each state's chart position
  geom_text(aes(label = IBUmaxLabel), size = 3, position = position_stack(vjust = 0.5)) + 
  # Add Division ABV Values to top of each chart stack
  geom_text(aes(Division, MaxDivIBU + SumStateIBU, label = MaxDivIBU), 
            size = 3, nudge_y = -75, fontface = "italic") +
  # Label the chart objects
  labs(title="Max IBU by State by US Census Division in the USA", 
       subtitle="Budweiser Consultation", 
       caption="source: IBU imputed where necessary.",
       y = "Alcohol By Volume",
       x = "States by US Census Divisions ") + 
  theme_classic() +
  # Remove y-labels and ticks since this is a stacked chart
  theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
  # Wrap X-axis labels
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))

######################
#                    #
#     Question 6     #
#                    #
######################
#############################################################################################################
# Question 6 - Comment on the summary statistics and distribution of the ABV variable.
#
# We observed summary statistics from the ABV data showing that once we filled in the missing 
# values as best as we could, there was a range of 0.10% to 12.80% with a median of 5.65% (median
# is simply the middle value if we were to arrange all the ABVs in either descending or ascending order).
#
# We also found a very common range within the overall range that went from 5.0% ABV to 6.70% ABV and 
# upon further review noticed this is where many commonly mass produced beers fall, for example: Bud 
# Ice (5.5%), Bud Light Platinum (6%), Natural Ice (5.9%), Bud Ice (5.5%), Budweiser (5%), Blue Moon 
# (5%), Stella Artois (5%), Heinekin (5%), Pabst Blue Ribbon (4.74%) and Miller Genuinine Draft (4.6%).
#
#############################################################################################################
# Check on the distribution of ABV
# Add histogram of ABV distribution
buzzbrews %>%
  ggplot(aes(x = ABV)) +
  geom_histogram(binwidth = 1, color = "#ffffff", fill = "#c8102e") +
  # Adjust the scale to be able to mark-up the chart later in PowerPoint to match the 
  # summary statistics - 1Q, middle 50%, 4Q
  scale_x_continuous(breaks = seq(0, 14, by = 1)) +
  stat_bin(binwidth = 1, aes(label = ..count..), vjust = -0.5, geom = "text", size = 3) +
  # Adjust titles
  labs(title = "Histogram of ABV Distribution",
     subtitle = "Busweiser Consultation",
     x = "ABV Values",
     y = "Number of Beers",
     caption = "source: ABV imputed where necessary.") +
  theme_classic()

ABVsummary <- summary(buzzbrews$ABV)
ABVsummary

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.100   5.000   5.650   5.975   6.700  12.800

######################
#                    #
#     Question 7     #
#                    #
######################
#############################################################################################################
# Question 7 - Is there an apparent relationship between the bitterness of the beer and its 
# alcoholic content? Draw a scatter plot.  Make your best judgment of a relationship and 
# EXPLAIN your answer.
#
# We used a scatter plot to viusally explore if there is any sort of relationship between 
# IBU and ABV, in other words can IBU determine ABV or can ABV be used to determine IBU. 
# There was evidence of a positive relationship, but and we will discuss this further shortly, 
# it appears one can potentially predict the other.
#
#############################################################################################################
# Label Ales, IPAs and neither - this is also setting up for the next question, 
# but we want to be able to utilize it here
buzzbrews$IPAAle = case_when(grepl("\\bIPA\\b", buzzbrews$Beer, ignore.case = TRUE) ~ "IPA",
                             grepl("\\bindia pale ale\\b", buzzbrews$Beer, ignore.case = TRUE) ~ "IPA",
                             grepl("\\bale\\b", buzzbrews$Beer, ignore.case = TRUE ) ~ "Ale",
                             TRUE ~ "Neither")
#view(buzzbrews)
## Calculate slope and intercept of line of best fit ##
comparisonCoef <- coef(lm(ABV ~ IBU, buzzbrews))
comparisonCoef

## (Intercept)         IBU 
##  4.71799073  0.03142639

#  (Intercept)    MaxIBU 
#   4.71799073  0.03142639 
# Remove this chart if we decide to use the one labeled with IPA, Ale, Neither
buzzbrews %>% 
  ggplot(aes(x = IBU, y = ABV, color = "#c8102e")) +
  geom_point(show.legend = FALSE, na.rm = TRUE) +
  geom_abline(intercept =  comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
  theme_classic() + 
  labs(title = "IBU vs ABV", 
       subtitle = "Budweiser Consultation",
       y = "Alcohol By Volume", 
       x = "Int'l Bitterness Unit",
       caption="ABV and IBU values imputed where necessary.")

buzzbrews %>% 
  ggplot(aes(x = IBU, y = ABV, color = IPAAle)) +
  geom_point(show.legend = TRUE, na.rm = TRUE) +
  geom_abline(intercept =  comparisonCoef[1] , slope = comparisonCoef[2], color = "#c8102E", size = 1) +
  theme_classic() + 
  labs(title = "IBU vs ABV", 
       subtitle = "Budweiser Consultation",
       y = "Alcohol By Volume", 
       x = "Int'l Bitterness Unit",
       caption="ABV and IBU values imputed where necessary.") +
  scale_color_manual(values = c("#c8102e","#13294b","#b1b3b3"),
                     name = "Type of Beer",
                     breaks = c("Ale", "IPA", "Neither"),
                     labels = c("Ale", "IPA", "Neither"))

DS6306: Cast Study 01

Ben Goodwin/ Justin Ehly

10/6/2020