African maize trials¶
Here we use data that was also used in a paper by Lobell et al. Nonlinear heat effects on African maize as evidenced by historical yield trials.
Get the data¶
The data is on CIMMYT’s dataverse.
https://data.cimmyt.org/dataset.xhtml?persistentId=hdl:11529/10548190
The ID is “hdl:11529/10548190” and we can download the data with the “agro” package. This package is under development. You can install it like this (if need be):
if (!require(agro)) source("https://install-github.me/reagro/agro")
## Loading required package: agro
Now we can use the package to download the data
ff <- agro::get_data_from_uri("hdl:11529/10548190", ".")
ff
## [1] "./hdl_11529_10548190/EIL_site_latlon.zip.gz"
## [2] "./hdl_11529_10548190/maizedata.lobell.sep2011.csv.zip.gz"
We have two “gz” (GNU zip) files. We can g-unzip these
print(R.utils::gunzip(ff[1], remove=FALSE, overwrite=TRUE))
## [1] "hdl_11529_10548190/EIL_site_latlon.zip"
## attr(,"nbrOfBytes")
## [1] 3259
print(R.utils::gunzip(ff[2], remove=FALSE, overwrite=TRUE))
## [1] "hdl_11529_10548190/maizedata.lobell.sep2011.csv.zip"
## attr(,"nbrOfBytes")
## [1] 1153763
Now we have zip files that we can unzip
unzip("hdl_11529_10548190/EIL_site_latlon.zip")
unzip("hdl_11529_10548190/maizedata.lobell.sep2011.csv.zip")
Read the data
location <- read.csv('EIL_site_latlon.csv', stringsAsFactors=FALSE)
maize_all <- read.csv('maizedata.lobell.sep2011.csv', stringsAsFactors=FALSE)
Explore¶
head(location)
## LocationID Country Location Region Latitude Longitude
## 1 5 ANGOLA ANGOLA Eastern and Southern Africa 0.00 0.00
## 2 6 ANGOLA CABINDA Eastern and Southern Africa -5.60 12.20
## 3 7 ANGOLA CACUSO Eastern and Southern Africa -9.42 15.75
## 4 8 ANGOLA CELA Eastern and Southern Africa -11.42 15.12
## 5 9 ANGOLA CHIANGA Eastern and Southern Africa -12.73 15.83
## 6 10 ANGOLA HUMPATA Eastern and Southern Africa -15.03 13.43
## ElevM
## 1 0
## 2 22
## 3 1067
## 4 1305
## 5 1693
## 6 1890
dim(maize_all)
## [1] 26142 37
maize <- maize_all[,c(1:9)]
maize$yield <- round(exp(maize$logYield), 1)
table(maize$Management)
##
## Drought Low N Low pH Maize Streak Virus
## 3244 2800 2032 353
## Optimal
## 17713
plot(sort(maize$yield))
par(mfrow=c(1,2))
boxplot(yield ~ Management, data=maize, horizontal=TRUE, las=2)
boxplot(yield ~ vargroup, data=maize, horizontal=TRUE, las=2)
library(maptools)
## Loading required package: sp
## Checking rgeos availability: TRUE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
data(wrld_simpl)
plot(location$Longitude, location$Latitude, col="red", pch=20)
plot(wrld_simpl, add=TRUE)
Combine¶
d <- merge(location, maize, by.x="LocationID", by.y="sitecode")
dsub <- d[, c("yield","Longitude", "Latitude", "Management", "vargroup", "Country" )]
da <- aggregate(yield ~ ., data=dsub, median)
#da <- aggregate(d[,"yield",drop=FALSE], d[,-1], data=dsub, median)
da <- da[order(da[,1], da[,2]), ]
head(da)
## Longitude Latitude Management vargroup Country yield
## 10 12.20 -5.60 Optimal EIHY ANGOLA 0.40
## 14 12.20 -5.60 Low N EPOP ANGOLA 0.30
## 16 12.20 -5.60 Low pH EPOP ANGOLA 0.85
## 31 12.20 -5.60 Low pH ILPO ANGOLA 0.30
## 40 12.20 -5.57 Optimal ILPO ANGOLA 0.00
## 4 13.43 -15.03 Optimal EIHY ANGOLA 4.50
table(da$Management, da$vargroup)
##
## EIHY EPOP ILHY ILPO
## Drought 10 15 8 9
## Low N 12 13 9 11
## Low pH 7 9 7 8
## Maize Streak Virus 1 2 1 1
## Optimal 68 69 51 60
dopt <- da[da$Management=="Optimal", ]
Model¶
head(da)
## Longitude Latitude Management vargroup Country yield
## 10 12.20 -5.60 Optimal EIHY ANGOLA 0.40
## 14 12.20 -5.60 Low N EPOP ANGOLA 0.30
## 16 12.20 -5.60 Low pH EPOP ANGOLA 0.85
## 31 12.20 -5.60 Low pH ILPO ANGOLA 0.30
## 40 12.20 -5.57 Optimal ILPO ANGOLA 0.00
## 4 13.43 -15.03 Optimal EIHY ANGOLA 4.50
library(randomForest)
m <- randomForest(yield ~ Longitude + Latitude, data=da)
m
##
## Call:
## randomForest(formula = yield ~ Longitude + Latitude, data = da)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 2.67184
## % Var explained: 40.28
p = predict(m)
plot(da$yield, p)
abline(0,1)
library(raster)
e <- extent(c(-22, 60, -37, 24))
aoi <- raster(ext=e, res=1/6)
pp <- interpolate(aoi, m, xyNames=c('Longitude', 'Latitude'))
pp <- mask(pp, wrld_simpl)
plot(pp)
points(da$Longitude, da$Latitude, col="blue")
lon = init(aoi, "x")
lat = init(aoi, "y")
s <- stack(lon, lat)
names(s) <- c("Longitude", "Latitude")
p2 <- predict(s, m)
bio <- getData("worldclim", var="bio", res="10")
## Warning in getData("worldclim", var = "bio", res = "10"): getData will be removed in a future version of raster
## . Please use the geodata package instead
plot(bio)
e <- extract(bio, da[, c("Longitude", "Latitude")])
de <- cbind(da[,"yield",drop=FALSE], e)
m <- randomForest(yield ~ ., data=de)
x <- predict(bio, m, ext=aoi)
plot(x)
de <- cbind(da, e)
head(de)
## Longitude Latitude Management vargroup Country yield bio1 bio2 bio3 bio4
## 10 12.20 -5.60 Optimal EIHY ANGOLA 0.40 248 75 55 1882
## 14 12.20 -5.60 Low N EPOP ANGOLA 0.30 248 75 55 1882
## 16 12.20 -5.60 Low pH EPOP ANGOLA 0.85 248 75 55 1882
## 31 12.20 -5.60 Low pH ILPO ANGOLA 0.30 248 75 55 1882
## 40 12.20 -5.57 Optimal ILPO ANGOLA 0.00 248 75 55 1882
## 4 13.43 -15.03 Optimal EIHY ANGOLA 4.50 173 133 65 1789
## bio5 bio6 bio7 bio8 bio9 bio10 bio11 bio12 bio13 bio14 bio15 bio16 bio17
## 10 309 173 136 265 219 265 219 912 170 0 83 477 2
## 14 309 173 136 265 219 265 219 912 170 0 83 477 2
## 16 309 173 136 265 219 265 219 912 170 0 83 477 2
## 31 309 173 136 265 219 265 219 912 170 0 83 477 2
## 40 309 173 136 265 219 265 219 912 170 0 83 477 2
## 4 265 63 202 182 147 188 144 796 169 0 89 425 0
## bio18 bio19
## 10 477 2
## 14 477 2
## 16 477 2
## 31 477 2
## 40 477 2
## 4 158 9
de$Country = NULL
de$Management <- as.factor(de$Management)
de$vargroup <- as.factor(de$vargroup)
m2 <- randomForest(yield~., data=de)
m2
##
## Call:
## randomForest(formula = yield ~ ., data = de)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 2.124513
## % Var explained: 52.51
varImpPlot(m2)
afbio <- crop(bio, s)
predictors <- stack(afbio, s)
df <- data.frame(Management="Optimal", vargroup="EPOP", stringsAsFactors = F)
df2 <- data.frame(Management="Optimal", vargroup="ILPO", stringsAsFactors = F)
#df <- data.frame(Management=5, vargroup=2)
pxd <- predict(predictors, m2, const=df2)
plot(pxd)
Now determine which vargroup performs best where, on average?
Do this for optimal and for drought?
when done, make it better…