RESOURCES

Useful R codes for big data analysis

# 1. Load multiple layers from a geodatabase (e.g. plantations_v1_3_dl.gdb, downloadable from http://data.globalforestwatch.org/datasets/planted-forests)

# 2. Merge all layers into one spatial dataframe

# 3. Write the merge file into a shapefile.

# Developed by Jingjing Liang @7/17/2020. This script comes with absolutely no warranty. Use at your own risk.

library(sf)

layers <- st_layers(“C:/mydirectory/plantations_v1_3_dl.gdb”)
layers <- layers$name

shp1 <- sf::st_read(dsn = “C:/mydirectory/plantations_v1_3_dl.gdb”, layer=layers[1])

for (i in 2:length(layers)){
shp <- sf::st_read(dsn = “C:/mydirectory/plantations_v1_3_dl.gdb”, layer=layers[i])
shp1 <- rbind(shp1,shp)
}

st_write(shp1, dsn=paste0(“C:/mydirectory/plantaions_v1_3_shp/”, “plantation_merged.shp”),driver = “ESRI Shapefile”)

#### Code to Compile FIA data and compile tree list data for our class
### Developed by Jingjing Liang (albeca.liang@gmail.com), 2020. All rights reserved.

#Load FIA data sets, downloaded from FIA website

# Load tree-level data
tree_df<-read.csv(“TREE.csv”)
str(tree_df)
vars <- c(“PLT_CN”, “INVYR”, “STATUSCD”, “SPCD”, “DIA”, “TPA_UNADJ”)
tree_df2 <- tree_df[vars]
names(tree_df2) <- c(“PLT_CN”, “YR”, “STATUSCD”, “SPCD”, “DBH”, “TPA”)

# Load plot and species data
plot_df<-read.csv(“PLOT.csv”)
spp_df <-read.csv(“REF_SPECIES.csv”)

# Select needed variables from the Plot table
vars <- c(“CN”, “LAT”, “LON”, “ELEV”)
PLT_df <- plot_df[vars]
names(PLT_df)[1] <- “PLT_CN”

# Select needed variables from the species table
vars <- c(“SPCD”, “GENUS”, “SPECIES”)
SPP_df <- spp_df[vars]

# Extract plot coordinates and elevation to the tree table
df <- merge(tree_df2,PLT_df,by=”PLT_CN”)

# Extract species name to the tree table
df <- merge(df,SPP_df,by=”SPCD”)

# Dataset QC
#data <- na.omit(df)
data <- cbind.data.frame(seq(1,length(df[,1]),1), df)
names(data)[1] <- “FID”
summary(data)
dim(data)

# Number of trees by status
summary(as.factor(data$STATUSCD))

# Save tree dataset
write.csv(data, “TREEdata.csv”)

# Now let’s have some fun learning tree species in the United States

# Genera
levels(data$GENUS)
summary(data$GENUS)

# Species
species <- as.factor(paste(data$GENUS,” “,data$SPECIES))
levels(species)
summary(species)

# End of the code

#### Code to Compile FIA data for plot population structure
### Developed by Jingjing Liang (albeca.liang@gmail.com), 2021. All rights reserved.

# Load FIA tree data generated previously
tree_df <- read.csv(“FIA_TREEdf_AcerProj_04042021.csv”)
names(tree_df)

# Converting data to metric units
dat <- tree_df[!is.na(tree_df$DBH),]
dat <- dat[!is.na(dat$TPA),]
dat$DBH <- ifelse(dat$STATUSCD==1, dat$DBH*2.54,0) # inch to cm
dat$TPH <- ifelse(dat$STATUSCD==1, dat$TPA/0.405,0) # acre to hectare

## Add SPP group and DBH group identifiers

# SPP group
SPGP <- read.csv(“Acer_SpeciesGroup10.csv”)
names(SPGP) <- c(“X”,”species”,”SPGP”)

dat <- merge(dat,SPGP,by=”species”)
dat$SPGP <- as.factor(dat$SPGP)
summary(dat$SPGP)

# DBH group
DBHseq <- seq(5,31,2) * 2.54
DBHGP <- cut(dat$DBH, breaks=DBHseq, right=FALSE)
summary(DBHGP)

dat <- cbind.data.frame(dat,DBHGP)

dat$D_GP <- ifelse(is.na(dat$DBHGP) & dat$DBH>=78.7, 14, dat$DBHGP)
dat$D_GP <- ifelse(is.na(dat$DBHGP) & dat$DBH<12.7, 0, dat$D_GP)
dat$D_GP <- as.factor(dat$D_GP)
summary(dat$D_GP)

# Save Data
write.csv(dat,”TREE_AcerProj_labelled_04142021.csv”)

## Derive plot attributes and plot abundance matrix
rm(list = ls())
gc()
dat <- read.csv(“TREE_AcerProj_labelled_04142021.csv”)

# remove trees smaller than 5″ in DBH
dat <- subset(dat, dat$D_GP>0)

# Convert tree dataframe to plot abundance matrix
#dat$SP_D_GP <- paste0(dat$SPGP,”_”,dat$D_GP)
#levels(as.factor(dat$SP_D_GP))

#X <- tapply(dat$TPH, list(dat$PLT_CN, dat$SP_D_GP), sum)
#dim(X)

# Plot abundance matrix
#PAM <- as.data.frame(X)
#row.names(PAM) <- NULL
#PAM[is.na(PAM)] <- 0

# Calculate total number of live trees by plot
N <- tapply(dat$TPH, dat$PLT_CN, sum)
summary(N)

# Calculate total stand basal area
B <- tapply(3.14*dat$DBH^2/40000*dat$TPH, dat$PLT_CN, sum)
summary(B)

## Calculate plot diversity
# species diversity
X_SP <- tapply(dat$TPH, list(dat$PLT_CN, dat$SPGP), sum)
X_SP <- as.data.frame(X_SP)
X_SP[is.na(X_SP)] <- 0
X_SP$N <- rowSums(X_SP)

pi <- X_SP[,1:10]/X_SP$N
ln_pi <- log(pi)
Shannon <- pi*ln_pi
Shannon[is.na(Shannon)] <- 0
Hs <- rowSums(Shannon)*-1
summary(Hs)

# size diversity
X_DBH <- tapply(dat$TPH, list(dat$PLT_CN, dat$D_GP), sum)
X_DBH <- as.data.frame(X_DBH)
X_DBH[is.na(X_DBH)] <- 0
X_DBH$N <- rowSums(X_DBH)

pi <- X_DBH[,1:14]/X_DBH$N
ln_pi <- log(pi)
Shannon <- pi*ln_pi
Shannon[is.na(Shannon)] <- 0
Hd <- rowSums(Shannon)*-1
summary(Hd)


# Plot coordinates
LAT <- tapply(dat$LAT,dat$PLT_CN,mean)
LON <- tapply(dat$LON,dat$PLT_CN,mean)
summary(LON)
summary(LAT)

#plot(Coords_x, Coords_y)

# Plot ID and year
PLT_ID <- tapply(dat$PLT_CN,dat$PLT_CN,mean)
Year <- tapply(dat$YR,dat$PLT_CN,mean)

Plot_df <- cbind.data.frame(PLT_ID,Year,LON,LAT,B,N,Hs,Hd)
write.csv(Plot_df, “Acer_PLT_04222021.csv”)

# End of the code