# Import the raw table and clean it into one row per state/jurisdiction with
# numeric Grade.* columns.
#   * `skip = 1` ignores the first line of the file; empty cells and the
#     "‡" marker are read as NA.
#   * The first three data rows are discarded and at most the next 60 rows are
#     kept.
#   * Columns whose names start with "X" are dropped.
#   * Rows without a state name are removed.
df <- read.csv(
  "Dropout Final Project Data.csv",
  skip = 1,
  stringsAsFactors = FALSE,
  na.strings = c("", "‡")
) %>%
  slice(-(1:3)) %>%
  # head() is safe when fewer than 61 rows remain; `-(61:nrow(.))` would count
  # backwards in that case and drop valid trailing rows.
  head(60) %>%
  select(-starts_with("X")) %>%
  rename(state = State.or.jurisdiction) %>%
  filter(!is.na(state)) %>%
  # Removes every literal "2" from the name -- presumably a footnote marker
  # attached to some state names; TODO confirm against the raw file.
  mutate(state = gsub("2", "", state, fixed = TRUE)) %>%
  # Strip thousands separators before converting. Any cell that is still not
  # numeric afterwards becomes NA with a coercion warning.
  mutate(across(
    starts_with("Grade"),
    function(x) as.numeric(gsub(",", "", x, fixed = TRUE))
  ))
#R> Warning in (function (x) : NAs introduced by coercion
#R> Warning in (function (x) : NAs introduced by coercion
#R> Warning in (function (x) : NAs introduced by coercion
#R> Warning in (function (x) : NAs introduced by coercion
#R> Warning in (function (x) : NAs introduced by coercion
#R> Warning in (function (x) : NAs introduced by coercion
#R> state Grade.7 Grade.8 Grade.9 Grade.10 Grade.11 Grade.12
#R> 1 Alabama NA NA 64569 53604 47538 42672
#R> 2 Alaska 10886 10857 11934 10664 9625 8766
#R> 3 Arizona 80011 78747 92028 82686 75384 71161
#R> 4 Arkansas NA NA 38359 35869 31995 28698
#R> 5 California 496757 502927 554935 501661 462940 412605
#R> 6 Colorado 59548 59397 64446 57678 52770 50387
# Reshape to long form: one row per state and grade, with the grade number
# taken from the original column name (e.g. "Grade.7" -> 7).
df2 <- df %>%
  pivot_longer(-state, names_to = "Grade", values_to = "Dropouts") %>%
  # fixed = TRUE matches the literal prefix "Grade."; as a regex the "." would
  # match any character.
  mutate(Grade = as.numeric(gsub("Grade.", "", Grade, fixed = TRUE)))
head(df2)
#R> # A tibble: 6 x 3
#R> state Grade Dropouts
#R> <chr> <dbl> <dbl>
#R> 1 Alabama 7 NA
#R> 2 Alabama 8 NA
#R> 3 Alabama 9 64569
#R> 4 Alabama 10 53604
#R> 5 Alabama 11 47538
#R> 6 Alabama 12 42672
# Bar chart of the df2 counts, stacked into a single bar per state.
# Rows whose count is NA are still dropped by ggplot2 with a warning.
p <- ggplot(data = df2, mapping = aes(x = state, y = Dropouts)) +
  # geom_col() is geom_bar(stat = "identity").
  geom_col(color = "grey", alpha = 0.75, fill = "wheat") +
  scale_x_discrete(name = "State") +
  # No hard `limits` here: scale limits turn out-of-range values into NA, so
  # the previous c(0, 100000) silently removed every stacked segment above
  # 100,000. The axis now spans the data; if a zoomed view is wanted, use
  # coord_cartesian(ylim = ...) instead, which does not discard rows.
  scale_y_continuous(
    name = "Total Number of Student Dropouts",
    # Plain digits with thousands separators instead of scientific notation.
    labels = function(x) {
      format(x, big.mark = ",", scientific = FALSE, trim = TRUE)
    },
    expand = expansion(mult = c(0, 0.01))
  ) +
  # theme_bw() is a complete theme and replaces earlier theme() settings, so
  # the tweaks must come after it.
  theme_bw() +
  theme(
    legend.position = "none",
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(angle = 90)
  ) +
  # Axis titles come from the scale names above.
  labs(title = "Dropouts per State in 2004-2005")
p
#R> Warning: Removed 94 rows containing missing values (position_stack).
#R> Warning: Removed 101 rows containing missing values (geom_bar).
# Dropouts in the US per grade
# Bar chart of the df2 counts stacked per grade (7-12) across all states.
# Only builds the plot object; nothing is printed here.
p2 <- ggplot(data = df2, mapping = aes(x = Grade, y = Dropouts)) +
  # geom_col() is geom_bar(stat = "identity").
  geom_col(fill = "grey") +
  scale_x_continuous(name = "Grade", breaks = seq(7, 12, 1)) +
  scale_y_continuous(
    name = "Total Dropouts",
    breaks = seq(0, 4000000, 100000)
  ) +
  theme_bw() +
  theme(axis.text.y = element_text(hjust = .25)) +
  # Axis titles come from the scale names above.
  labs(title = "US Dropouts per Grade in 2004-2005") +
  # Baseline at zero. This layer must be chained with `+`; as a separate
  # statement it is only printed to the console and never added to the plot.
  geom_hline(yintercept = 0)
#R> mapping: yintercept = ~yintercept
#R> geom_hline: na.rm = FALSE
#R> stat_identity: na.rm = FALSE
#R> position_identity
#R> Warning: Removed 37 rows containing missing values (position_stack).