## ----listing files------------------------------------------------------------
datapath <- "../data/Africa RISING/Tanzania/Baseline/data"
ff <- list.files(datapath, pattern='\\.tab$', full=TRUE)
length(ff)


## ----Reading in the data------------------------------------------------------
x <- lapply(ff, read.delim)


## ----naming tables------------------------------------------------------------
z <- basename(ff)
z <- gsub(".tab$", "", z)
z <- substr(z, 5, nchar(z))
names(x) <- z


## ----Household tab------------------------------------------------------------
d <- x$interview


## ----Household groups---------------------------------------------------------
id <- 1:9
description <- c("AR", "IE, no coupon", "IE, one coupon", "IE, multiple coupons", 
                 "AR + IE, no coupon", "AR + IE, one coupon", "AR + IE, multiple coupons", 
                 "non-benificary in benificiary village", "control")

code <- c("AR", "no coupon", "coupon", "coupon", "no coupon", "coupon", "coupon", "IB", "cont.")

codetab <- data.frame(id=id, code=code, desc=description)
codetab


## ----newvar-------------------------------------------------------------------
d$tgroup <- code[codetab$desc]


## ----Household Data-----------------------------------------------------------
group <- as.factor(x$interview$group)
#There are many more groups in the survey than displayed in the summary statistics; some of the groups are combined, which the code below allows us to do.
levels(group) <- c("1", "2", "3", "3", "2", "3", "3", "8", "9")
levels(group) <- c("AR", "no.coupon", "coupon", "IB", "control")
district <- as.factor(x$interview$a2)
levels(district) <- c("Bab", "Kon", "Kit")
#Generates a table for number of each group in each district
table(district, group)


## ----Household Variables------------------------------------------------------
#First create a dataframe for household information
HH <- as.data.frame(x$interview$hhid)
colnames(HH)<-"hhid"

#To calcuate the household size, we make a table that counts number of individuals per household.
size <- as.data.frame(table(x$sectionB$hhid))
colnames(size) <- c("hhid", "size")

#This takes information from just the household head.
head <- x$sectionB[x$sectionB$b2 == 1,]

#Additional household variables
HH$group <- group
HH$distt <- district
HH$size <- size$size
HH$sex <-ifelse(head$b3 == 1, 0, 1)
HH$age <- head$b4a
#To calculate dependency rate, which is the number of people in household under 15 or over 65 divided by total household population
age <- x$sectionB[, c("hhid", "b4a")]
age$b4a[age$b4a<0] <- NA
age$b4a <- ifelse(age$b4a < 15| age$b4a > 65, 1, 0)

depend <- aggregate(age$b4a, list(age$hhid), sum)
depend <-as.data.frame(depend)
depend$count <- HH$size
HH$dependency <- depend$x/depend$count


## ----HH demographics cont.----------------------------------------------------

HH$no.school <-ifelse(head$b6 == "-95", 1, 0)
HH$prim.school <- ifelse(head$b6 >= 3 & head$b6<=11, 1, 0)
HH$sec.school <- ifelse(head$b6 >= 12, 1, 0)
HH$not.lit <- ifelse(head$b7 == "-95", 1, 0)
HH$kiswahili <- ifelse(head$b7 == "1", 1, 0)
HH$eng.kis <- ifelse(head$b7 == "3", 1, 0)
HH$ag <- ifelse(head$b8 =="1" | head$b8 == "2"| head$b8 == "4", 1, 0)
HH$married <- ifelse(head$b10 != "6", 1, 0)
HH$christian <- ifelse(x$interview$a19=="1", 1, 0)
HH$muslim <- ifelse(x$interview$a19 == "2", 1, 0)



## ----Summary Statistics Dataframe---------------------------------------------
mean <- apply(na.omit(HH[,4:17]), 2, mean)
sd <- apply(na.omit(HH[,4:17]), 2, sd)
dem <- data.frame(mean, sd)

#To get statistics by group
bygroup <- aggregate(HH[,4:17], list(HH$group),mean, na.rm=T)
#Save the names of each group, which is the first column
ngroup <- bygroup[,1]
#Transpose the table and create a dataframe
bygroup <- as.data.frame(t(bygroup[,-1]))
#Add back the name of each group
colnames(bygroup) <- ngroup
#Combine the summary information by group to the demographic dataframe
dem <- data.frame(dem, bygroup)


## ----by group and gender------------------------------------------------------
bydistrict <- aggregate(HH[,4:17], list(HH$distt),mean, na.rm=T)
n <- bydistrict[,1]
bydistrict <- as.data.frame(t(bydistrict[,-1]))
colnames(bydistrict) <- n
dem <- data.frame(dem, bydistrict)

#Finally, get summary statistics by gender
bygender <- aggregate(HH[,4:17], list(HH$sex), mean, na.rm=T)
bygender <- as.data.frame(t(bygender[,-1]))
colnames(bygender) <- c("male", "female")
dem <- data.frame(dem, bygender)
dem <- round(dem, 2)

knitr::kable(dem, caption ="Household Demographics")


## ----SizebyDistrict-----------------------------------------------------------
#This code generates different tables for each distrcit
Babati <- HH[district=="Bab",]
Kongwa <- HH[district=="Kon",]
Kiteto <- HH[district=="Kit",]

#Household size for each district
babsize <- tapply(Babati$size, Babati$group, mean)
konsize <- tapply(Kongwa$size, Kongwa$group, mean)
kitsize <- tapply(Kiteto$size, Kiteto$group, mean)

#Combine all district information
all <- cbind(babsize, konsize, kitsize)

#Create a barplot with the information on household sizes in each district and group
colors <-c("lightblue", "pink", "red", "darkblue", "darkgreen")
barplot(all, beside=T, horiz=T, col=colors)
#plot.new()
#legend("topright", rownames(all), fill=colors, xpd=T)



## ----District P Vals test-----------------------------------------------------
vars <- colnames(Kongwa)[4:17]

d <- data.frame(bab.vs.kit=rep(as.numeric(NA), length(vars)), bab.vs.kong=NA, kong.vs.kit=NA)
rownames(d) <- vars


for (i in seq_along(vars)){
  j <- vars[i] 
  d[i,1] <- t.test(Babati[[j]], Kiteto[[j]])$p.value
  d[i,2] <- t.test(Babati[[j]], Kongwa[[j]])$p.value
  d[i,3]<- t.test(Kongwa[[j]], Kiteto[[j]])$p.value
}

round(d,3)


## ----District P Vals test2----------------------------------------------------
p_to_star <- function(p) {
  ifelse(p <= .01, "***", ifelse(p < .05, "**", ifelse(p <= .1, "*", "")))
}  

d <- p_to_star(d)

kable(t(d))



## ----Summary of Assets--------------------------------------------------------
#First I remove two variables that are not necessary in the reshape to count number of assets. Group is the same for each household, and year is also not important in this process.
wide <- x$sectionO2[, -c(2,5)]
#Reshape the data from wide to long so that there are as many observations as households.
long <- reshape(wide, idvar="hhid", timevar = "assetid", direction = "wide")
#This code creates a dummy variable, so that households that have the asset are given a value of 1, regardless of how many they own.
long[,2:38] <-ifelse(long[,2:38] >= 1, 1, 0)

#Livestock assets are also found on another table. We similarly need to reshape soo that only 
livestock <- x$sectionJ1[,c(1,3,4)]
llivestock <- reshape(livestock, idvar="hhid", timevar = "j1_2", direction = "wide")


## -----------------------------------------------------------------------------
allasset <- cbind(long, llivestock)
#Remove duplicate household ID. 
allasset <- allasset[,-39]
#Here we add a few variables from a different table to this dataframe, and convert them to 0 1 variables. 
allasset$wall <- ifelse( x$sectionO1$o1 == 4, 1, 0)
allasset$floor <- ifelse(x$sectionO1$o2 == 4, 1, 0)
allasset$roof <- ifelse(x$sectionO1$o3 == 4, 1, 0)
allasset$water <- ifelse( x$sectionO1$o7 == 1, 1, 0)
allasset$light <- ifelse(x$sectionO1$o9 == 1, 1, 0)
#This changes the livestock columns from 1-2 values to 0-1.
allasset[,39:59] <- ifelse(allasset[,39:59] == 2, 0, 1)

#We can compare the means from all assets to table 8 of the summary report, and see that the two are the same.
means <-apply(allasset[,-1], 2, mean, na.rm=T)
means<- round(means, digits = 3)
#Two of the columns have values of all 0, and this lack of variation is not useful for our analysis.
allasset <- allasset[, -c(1,14,43)]
#To preserve the information on household ID, I change the rownames to match household ID. 
rownames(allasset) <- long$hhid


## ----Wealth Index-------------------------------------------------------------
#This function does PCA on the table of all assets. 
pr.out =prcomp (na.omit(allasset) , scale=TRUE)
#The first column in the output is the first principal component. We make this into a dataframe, and add a column for household ID. PC1 is used as the wealth index.
PC1 <- pr.out$x[,1]
PC1 <- as.data.frame(PC1)
PC1$hhid <- rownames(PC1)

#Now, we want to add a column that tells us the quintile of wealth. 
PC1.sorted <- PC1[order(PC1$PC1), ]
PC1.sorted$quintile <-1
PC1.sorted[160:321,3] <-2
PC1.sorted[322:483,3] <-3
PC1.sorted[484:645,3] <-4
PC1.sorted[646:809,3] <-5

#To replicate the chart, we take the mean and standard deviation and combine them into a single dataframe. 
quint.mean <- aggregate(PC1.sorted$PC1, list(PC1.sorted$quintile), mean)
quint.sd <- aggregate(PC1.sorted$PC1, list(PC1.sorted$quintile), sd)
quintiles <- data.frame(quint.mean, quint.sd)
quintiles <- quintiles[,-3]
colnames(quintiles) <- c("quintile", "mean", "SD")
kable(quintiles, caption= "Wealth Index by Quintile")


## ----BMI of Women-------------------------------------------------------------
#First create a table with just height and weight
BMI <- x$sectionT[,c("hhid", "t7a", "t8a")]
names(BMI) <- c("hhid", "weight", "height")
#The next codes first convert from cm to m, then square height and divide weight by height
BMI$height <- BMI$height/100
BMI$height <- BMI$height*BMI$height
BMI$BMI <- BMI$weight/BMI$height

#Here we merge the BMI data with the quintile data by household ID.
BMIquint <- merge(BMI, PC1.sorted, by="hhid")
#Next, we create a column that labels each of the BMI according to the distinction of overweight, normal, underweight, or obese. 
BMIquint$dist <-"underweight"
BMIquint$dist[BMIquint$BMI<25 & BMIquint$BMI>18.5] <- "normal"
BMIquint$dist[BMIquint$BMI<30 & BMIquint$BMI>=25] <- "overweight"
BMIquint$dist[BMIquint$BMI>=30] <- "obese"
BMIquint$dist[is.na(BMIquint$BMI)] <-NA


## ----Pie of BMI by quintile---------------------------------------------------
#Create a table for just the poorest households
poor <- BMIquint[BMIquint$quintile=="1",]
#The table function counts the number of individuals in each category
poortable <-table(poor$dist)
#To generate a percentage
pct <- round(poortable/sum(poortable)*100)
lbs <- c("normal", "overweight", "underweight")
#Labels of the pie chart
label<- paste(pct, "%", lbs)

#The same code is used for the medium and highest quintile individuals.
med<-BMIquint[BMIquint$quintile=="3",]
medtable <- table(med$dist)
pct2 <- round(medtable/sum(medtable)*100)
lbs2 <- c("obese", "normal", "overweight", "underweight")
label2<- paste(pct2, "%", lbs2)

rich <-BMIquint[BMIquint$quintile=="5",]
richtable <- table(rich$dist)
pct3 <- round(richtable/sum(richtable)*100)
lbs3 <- c("obese", "normal", "overweight", "underweight")
label3<- paste(pct3, "%", lbs3)

#Finally, we create all the pie charts in a single output. 
par(mfrow=c(1,3) )
pie(poortable, labels=label, main = "Lowest Quintile")
pie(medtable, label=label2, main = "Medium Quintile")
mtext(side=1, text="Weight by wealth quintile")
pie(richtable, label=label3, main = "Highest Quintile")


## ----means difference BMI by district-----------------------------------------
BMIdist <- merge(BMIquint, HH, by="hhid")
womendist <- aggregate(BMIdist$BMI, list(BMIdist$distt), mean, na.rm=T)

dist1 <- BMIdist$BMI[BMIdist$distt=="Bab"]
dist2 <- BMIdist$BMI[BMIdist$distt=="Kon"]
dist3 <- BMIdist$BMI[BMIdist$distt=="Kit"]
t.test(dist1, dist2, alternative="two.sided", conf.level=0.95)
t.test(dist2, dist3, alternative="two.sided", conf.level=0.95)
t.test(dist1, dist3, alternative="two.sided", conf.level=0.95)

#TODO: I tried to replicate table 14 but am confused about what is being compared in this table. The mean BMI for each category of weight, or the percent of each district that falls into each weight category.


## -----------------------------------------------------------------------------
#Begin community dataframe with type and population
community <- x$sectionCF[,c(3, 4)]
colnames(community) <- c("type", "population")
community$type <- ifelse(community$type=="1", "intervention", "control")
community$type <- as.factor(community$type)
community$elevation <- x$sectionCA$ca5c

#variables for the chairperson of the community
chairperson <- x$sectionCB[x$sectionCB$cb4 =="1",]

#Add variables for chair head
community$gender.chair <- ifelse(chairperson$cb2=="2", 1, 0)
community$age.chair <- chairperson$cb3 
community$years.chair <- chairperson$cb5

#Add variables for the informants in each community.
count <- as.data.frame(table(x$sectionCB$villageid))
community$no.inform <- count$Freq
x$sectionCB$villageid <-as.factor(x$sectionCB$villageid)

age.inform <-aggregate(x$sectionCB$cb3, list(x$sectionCB$villageid), mean)
community$age.inform <- age.inform$x

years.inform <- aggregate(x$sectionCB$cb5, list(x$sectionCB$villageid), mean)
community$years.inform <- years.inform$x

gender <- x$sectionCB$cb2
gender <- ifelse(x$sectionCB$cb2 == "2", 1, 0)
gender.inform <-aggregate(gender, list(x$sectionCB$villageid), mean)
community$gender.inform <- gender.inform$x


## -----------------------------------------------------------------------------
mean.overall <- colMeans(community[,2:10])

means <-aggregate(community[,2:10], list(community$type), mean)
means <- t(means)
means <- means[-1,]
colnames(means) <- c("intervention", "control")
means <- as.data.frame(means)

means$mean.overall <- mean.overall
kable(means, Caption = "Village, chairperson and informant characteristics")



## ----Population of Communities------------------------------------------------

pop <- data.frame(community$population)
pop$com <- rownames(pop)
#we can separate the observations into the three different districts 
pop$distt<- x$sectionCA$ca2
pop$distt <- as.factor(pop$distt)
levels(pop$distt) <- c("Kongwa", "Kiteto", "Babati")

#Order the communities by population, from large to small
pop2 <-pop[order(-pop$community.population),]

mycols <- c("blue", "darkblue", "lightblue")
barplot(pop2$community.population, col = mycols[pop2$distt], legend=levels(pop2$distt))

#TODO: Legend for barplot is messed up, and only shows two colors


## -----------------------------------------------------------------------------
#Extract the four crops that were asked on the survey
crops <- x$sectionCF[,c( "cf6a", "cf6b", "cf6c", "cf6d")]
colnames(crops)<- c("maize", "beans", "groundnut", "soybean")
#The remaining percentage of land area is labeled "other"
crops$other <- 100-crops$maize - crops$beans - crops$groundnut - crops$soybean
#Next, we add information on district and find the mean percentage dedicated to each crop in each district
crops$district <- x$sectionCA$ca2
crops2 <- aggregate(crops[,1:5], list(crops$district), mean)

#Next, we create  dataframes for each of the three districts.
cropBab <- crops2[1, 2:6]
cropBab <- t(cropBab)
cropBab <- round(cropBab, 1)

cropKong <- crops2[2, 2:6]
cropKong <-t(cropKong)

cropKit <- crops2[3, 2:6]
cropKit <- t(cropKit)
cropKit <-round(cropKit, 1)

#Finally, we display each district as a separate dataframe.
pie(cropKit[,1], col=colors, labels=cropKit[,1], main="Kiteto")
legend("left",legend=c("maize", "beans", "groundnut", "soybean", "other"), fill=colors, box.lty=0, title="Crops")
pie(cropBab[,1], col=colors, labels=cropBab[,1], main="Babati")
legend("left",legend=c("maize", "beans", "groundnut", "soybean", "other"), fill=colors, box.lty=0, title="Crops")
pie(cropKong[,1], col=colors, labels=cropKong[,1], main="Kongwa")
legend("left",legend=c("maize", "beans", "groundnut", "soybean", "other"), fill=colors, box.lty=0, title="Crops")