diff --git a/DESCRIPTION b/DESCRIPTION index 287f5b7..efa30ec 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,27 +1,28 @@ -Package: TerrainWorksUtils -Type: Package -Title: General utility functions for TerrainWorks -Version: 0.1.3 -Authors@R: - c( - person("Helen", "Miller", email = "helenthehuman@gmail.com", role = c("aut", "cre")), - person("Tate", "Brasel", role = "aut") - ) -Description: TerrainWorks resources for creating and assessing statistical models for predicting landslide susceptibility. -License: MIT + file LICENSE -Encoding: UTF-8 -LazyData: true -Imports: - methods, - randomForest, - ROCR, - stats, - terra (>= 1.5) -RoxygenNote: 7.2.3 -Suggests: - knitr, - covr, - testthat (>= 3.0.0), - rmarkdown -Config/testthat/edition: 3 -VignetteBuilder: knitr +Package: TerrainWorksUtils +Type: Package +Title: General utility functions for TerrainWorks +Version: 0.1.3 +Authors@R: + c( + person("Helen", "Miller", email = "helenthehuman@gmail.com", role = c("aut", "cre")), + person("Tate", "Brasel", role = "aut") + ) +Description: TerrainWorks resources for creating and assessing statistical models for predicting landslide susceptibility. +License: MIT + file LICENSE +Encoding: UTF-8 +LazyData: true +Imports: + methods, + randomForest, + caret, + ROCR, + stats, + terra (>= 1.5) +RoxygenNote: 7.2.3 +Suggests: + knitr, + covr, + testthat (>= 3.0.0), + rmarkdown +Config/testthat/edition: 3 +VignetteBuilder: knitr diff --git a/R/build_models.R b/R/build_models.R index 68349ee..588c18e 100644 --- a/R/build_models.R +++ b/R/build_models.R @@ -33,7 +33,8 @@ #' @return Nothing #' @export #' -build_k_fold_rf_model <- function(data, +build_k_fold_model <- function(data, + type, seed = NULL, ctrl_method = "repeatedcv", folds = 5, @@ -44,6 +45,10 @@ build_k_fold_rf_model <- function(data, stop("Must provide data.") } + if (!type %in% c("rf", "glm")) { + stop("Must specify model type. 
Current options are: rf, glm") + } + if (!is.data.frame(data) | is.null(data$class)) { stop("Data must be a data frame (or coercible) with a \"class\" column") } @@ -68,7 +73,7 @@ build_k_fold_rf_model <- function(data, data = data, preProcess = preprocess, trControl = ctrl, - method = "rf", + method = type, metric = "AUC")) print(model) @@ -80,7 +85,6 @@ build_k_fold_rf_model <- function(data, } - #' @title Build Random Forest Model #' #' @param data A dataframe containing columns with model inputs and a diff --git a/R/create_model_dataframe.R b/R/create_model_dataframe.R new file mode 100644 index 0000000..01b1af9 --- /dev/null +++ b/R/create_model_dataframe.R @@ -0,0 +1,35 @@ +# define where to find the data and load it. +folder <- "//Mac/Home/Documents/terrainworks/code/sandbox/data/downloaded_3.6/out/" +folder <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/out/" +load(paste0(folder, "ls_1996.Rdata")) +load(paste0(folder, "ls_2007.Rdata")) +load(paste0(folder, "ls_2011.Rdata")) +load(paste0(folder, "nonls_1996.Rdata")) +load(paste0(folder, "nonls_2007.Rdata")) +load(paste0(folder, "nonls_2011.Rdata")) + +# remove these lines when the bug is fixed. the $geo field in the 2011 and 2007 +# data frames is not stored as an integer value +ls_2011$geo <- ls_2011$geo$GeoClass +ls_2007$geo <- ls_2007$geo$GeoClass + +# add year labels to each selection of landslide points and non-ls points +ls_1996$year <- rep(1996, length(ls_1996[, 1])) +ls_2007$year <- rep(2007, length(ls_2007[, 1])) +ls_2011$year <- rep(2011, length(ls_2011[, 1])) +nonls_1996$year <- rep(1996, length(nonls_1996[, 1])) +nonls_2007$year <- rep(2007, length(nonls_2007[, 1])) +nonls_2011$year <- rep(2011, length(nonls_2011[, 1])) + +# combine into one data frame. 
+subset96 <- seq(1, length(nonls_1996[,1]), 10) # 1:length(ls_1996[, 1]) +subset07 <- seq(1, length(nonls_2007[,1]), 10) # 1:length(ls_2007[, 1]) +subset11 <- seq(1, length(nonls_2011[,1]), 10) # 1:length(ls_2011[, 1]) +training_data <- rbind(ls_1996, + ls_2007, + ls_2011, + nonls_1996[subset96, ], + nonls_2007[subset07, ], + nonls_2011[subset11, ]) + +training_data$class <- as.factor(training_data$class) diff --git a/analysis/PFA_initiation_model.qmd b/analysis/PFA_initiation_model.qmd new file mode 100644 index 0000000..4ece060 --- /dev/null +++ b/analysis/PFA_initiation_model.qmd @@ -0,0 +1,612 @@ +--- +title: "PFA Landslide Initiation Model" +author: "Julia Lober" +format: + html: + toc: true + number-section: true + highlight-style: github +date: May 1, 2023 +editor: visual +params: + data_dir: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/out/ + input: text + dem_dir: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/ + input: text + pred_dir: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/predicting_input/ + input: text + init_pts: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/all_initiation_points.shp + input: text +--- + +```{r setup} +#| echo: false +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + error = FALSE +) +``` + +```{r load, include = FALSE} +# Modeling packages +library(mlr3) +library(mlr3spatial) +library(mlr3spatiotempcv) +library(mlr3learners) +library(mlr3tuning) +library(mlr3viz) +library(mlr3fselect) +library(mlr3filters) +library(mlr3pipelines) +library(iml) + +# Packages for model-building +library(glmnet) +library(ranger) + +# Packages for other things +library(terra) +library(ggplot2) +library(viridis) +library(patchwork) +library(ggpubr) +library(maps) +library(tidyterra) +library(data.table) 
+library(tictoc) + +library(TerrainWorksUtils) +``` + +# Landslide initiation susceptibility model for the Private Forest Accord + +This document outlines the process for making the landslide initiation model for the Oregon Private Forest Accord project. We work with data generated from the PFA_Sampling_Multiple_DEMs document that is included in this package. The primary focus of this document is to show the model-building process and decisions that resulted in the final model used in combination with the landslide runout model to generate riparian buffers for the Private Forest Accord. Broadly speaking, there were 3 major decisions made in this process: + +1. A method for feature selection. + +2. The type of model built. + +3. A spatio-temporal resampling strategy. + +## The data + +Locate the data files and assemble into one data frame for model training. We use the natural log of the partial contributing area values, as it shows a much more symmetrical distribution, which should be a better input for the model.
+ +```{r} +# Load all of the data +list_files <- c("ls_1996.Rdata", "ls_2007.Rdata", "ls_2011.Rdata", + "nonls_1996.Rdata", "nonls_2007.Rdata", "nonls_2011.Rdata") +capture.output(lapply(paste0(params$data_dir, list_files), load, .GlobalEnv), file = 'NUL') +ls_2011$geo <- ls_2011$geo$GeoClass +ls_2007$geo <- ls_2007$geo$GeoClass + +# Concatenate into one data frame +data <- rbind(transform(ls_1996, year = "1996"), + transform(ls_2007, year = "2007"), + transform(ls_2011, year = "2011"), + transform(nonls_1996, year = "1996"), + transform(nonls_2007, year = "2007"), + transform(nonls_2011, year = "2011")) + +sample_pnts <- data[!(is.na(data$dist_to_road)), 2:length(data)] +sample_pnts$geo <- as.factor(sample_pnts$geo) +sample_pnts$class <- as.factor(sample_pnts$class) + +# Take the natural log of the PCA values for a more uniform distribution +sample_pnts$ln_pca_k1_48 <- log(sample_pnts$pca_k1_48) +sample_pnts$ln_pca_k1_24 <- log(sample_pnts$pca_k1_24) +sample_pnts$ln_pca_k1_12 <- log(sample_pnts$pca_k1_12) +sample_pnts$ln_pca_k1_6 <- log(sample_pnts$pca_k1_6) + +# Add a few transformed parameters to look at +sample_pnts$gradient_sq <- sample_pnts$gradient ^ 2 +sample_pnts$grad_by_pca <- sample_pnts$gradient * sample_pnts$ln_pca_k1_48 + +``` + +```{r} +#| echo: false +points <- vect(cbind(sample_pnts[, "x"], y = sample_pnts[, "y"])) +points$yr <- sample_pnts$year +points$class <- sample_pnts$class +crs(points) <- "epsg:26910" +points <- project(points, "epsg:4326") + +load("/Users/julialober/Documents/terrainworks/code/sandbox/data/oregon_map.Rdata") + +ggplot() + + geom_polygon(data = oregon, mapping = aes(x = long, y = lat), + color = "white", fill = "white") + + geom_spatvector(data = points[points$class == "pos", ], + mapping = aes(color = as.factor(points[points$class == "pos", ]$yr), shape = as.factor(points[points$class == "pos", ]$yr)), + show.legend = TRUE, size = 1, alpha = 0.6) + + scale_color_manual(name = "Landslide points (count)", values = 
viridis(4)[1:3], labels = c("1996 (213)", "2007 (137)", "2011 (47)")) + + scale_shape_manual(name = "Landslide points (count)",values = c(1, 4, 17), labels = c("1996 (213)", "2007 (137)", "2011 (47)")) + + labs(x = "Longitude", y = "Latitude") +``` + +The data includes 397 landslide initiation points from the inventory assembled by the Oregon Department of Geology and Mineral Resources, and combines data points from a couple different storm studies. There are 3 broad temporal groupings roughly corresponding to the years 1996, 2007, and 2011. The majority of the points come from the 1996 storm study. + +The reader should refer to PFA_Sampling_Multiple_DEMs for details on how the negative points are selected. We use an imbalanced sample with 10x the number of non-landslide points as landslide points. The benefit of using more non-landslide points is a more accurate representation of the non-landslide area. An imbalanced sample for our linear regression model is likely to change the magnitude of the probabilities outputted but shouldn't change the marginal probabilities. Since much of our model analysis will be a comparison of proportions, where probabilities are integrated over total probabilities and area to produce a value for number of landslides, a uniform change in probabilities shouldn't impact our observations too much. + +### Covariates + +Candidate covariates were chosen based on our understanding of the physical properties that control landslide initiation. These include physical elevation derivatives calculated based on lidar DEMs, partial contributing area for a given storm duration, total contributing area, stand age, distance to a road, and geological sediment type. Although we look at models that include stand age and distance to a road, these covariates should actually be left out of the final model as they will change over time. + +The elevation derivatives that we consider are mean curvature and gradient (or slope angle).
These were selected on the basis of the stability model which includes those as inputs for determining when a slope will fail. Steeper slopes are more likely to slide, as are more convergent slopes(?). Partial contributing area is calculated based on a hydraulic conductivity for the soil (which we assign a value of 1) and the storm duration. We looked at a variety of storm durations, since that reflects the conditions that we are actually interested in. + +It is helpful to look at the how the distributions of predictors vary between our landslide points and our non-landslide points. This informs our expectations for which features will be the most important for the model and increases our confidence in the results of feature selection processes. + +```{r} +#| echo: false + +p1 <- ggplot(data = sample_pnts) + + geom_density(aes(x = gradient, fill = class), alpha = 0.4, color = NA) + + labs(x = "Gradient", y = "") + + # scale_color_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + +p2 <- ggplot(data = sample_pnts) + + geom_density(aes(x = mean_curv, fill = class), alpha = 0.4, color = NA) + + labs(x = "Mean curvature", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + +p3 <- ggplot(data = sample_pnts) + + geom_density(aes(x = ln_pca_k1_48, fill = class), alpha = 0.4, color = NA) + + labs(x = "ln(PCA), 48 hr duration", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + +p4 <- ggplot(data = sample_pnts) + + geom_density(aes(x = age, fill = class), alpha = 0.4, color = NA) + + labs(x = "Stand age", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + + xlim(0, 300) + +p5 <- 
ggplot(data = sample_pnts) + + geom_density(aes(x = dist_to_road, fill = class), alpha = 0.4, color = NA) + + labs(x = "Distance to a road", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + + xlim(0, 600) + +p6 <- ggplot(data = sample_pnts) + + geom_density(aes(x = grad_by_pca, fill = class), alpha = 0.4, color = NA) + + labs(x = "Gradient * ln(PCA), 48 hr duration", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + +p7 <- ggplot(data = sample_pnts) + + geom_density(aes(x = ln_pca_k1_6, fill = class), alpha = 0.4, color = NA) + + labs(x = "ln(PCA), 6 hr duration", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + + +p8 <- ggplot(data = sample_pnts) + + geom_density(aes(x = total_accum, fill = class), alpha = 0.4, color = NA) + + labs(x = "Total accumulation area", y = "") + + scale_fill_manual(values = c("black", "red"), labels = c("Non-landslide points", "Landslide points"), name = "") + + xlim(0, 150) + +# ggarrange(p1, p2, p7, p3, common.legend = TRUE) +combo1 <- p1 + p2 + p7 + p3 + + plot_layout(guides = "collect") & + theme(legend.position = "top") +combo1 +``` + +```{r} +#| echo: false +combo2 <- p4 + p5 + p6 + p8 + + plot_layout(guides = "collect") & + theme(legend.position = "top") +combo2 +``` + +These plots indicate the features where the biggest differences are noticed between landslide points and non-landslide points. Notably, the distributions of gradient between landslide and non-landslide points do not vary as much as some of the other covariate candidates. This is surprising, given our physical understanding of slope angle as a major factor in landslide initiation. It is important to note that the sampled non-landslide points are limited to the range of gradient values observed in the landslide inventory.
That is, any point with too low or too high of a slope angle is assumed to have zero probability of landslide initiation and is left out of this study. This filtering method is called an analysis mask and the details on what this looks like can be found in the PCA_Sampling_Multiple_DEMs document. + +Features with noticeably different distributions between landslide and non-landslide points include mean curvature and partial contributing area. + +## Logistic Regression model + +A logistic regression model was chosen primarily because it is simple to code into FORTRAN, where the bulk of the data will be run through, and because it performed just as well as a Random Forest model during preliminary comparisons. + +A logistic regression model is a type of linear classifier. The classifier finds coefficients for the covariates that minimize some error value or maximize accuracy (this measure can be specified). We are not interested in a binary classification for the issue of landslide initiation, so we look at the probability that a given point is classified in the positive class; in other words, the probability of landslide initiation at a location. + +We are using the `mlr3` ecosystem for modeling in R. This package is generally considered the newest development in machine learning modeling in R - it is more nimble than some other modeling packages and is being updated and expanded frequently. It operates using an object-oriented design. `Tasks` are used to establish what data you are training and testing with, as well as what the prediction goal is. We specify our task as a spatio-temporal task, which allows us to use a spatial resampling method.
+ +```{r} +training_cols <- c("x", "y", "gradient", "mean_curv", "dist_to_road", "total_accum", "geo", "age", "class", "ln_pca_k1_48", "ln_pca_k1_24", "ln_pca_k1_12", "ln_pca_k1_6", "gradient_sq", "grad_by_pca") + +training_cols_less <- !(training_cols %in% c("age", "dist_to_road")) + +ls_task <- as_task_classif_st( + x = sample_pnts[, training_cols], + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910" +) + +ls_task_less <- as_task_classif_st( + x = sample_pnts[, training_cols_less], + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910" +) +``` + +## Spatial resampling + +A common problem among modeling problems with a spatio-temporal aspect is that traditional resampling methods can lead to overfitting of models and overoptimistic estimations of how the model will perform when shown new data. With traditional resampling methods for training and test set splits, both sets are likely to have representation across the spatio-temporal spread of the data. Due to the nature of spatio-temporal data, new data is often outside the domain of the training data (either temporally or spatially), which means that we are training a model in one domain and then using it on data in a slightly different domain. Spatio-temporal resampling methods prevent this by assigning training and test sets based on spatial or temporal groupings, testing the model on a section of data that it hasn't seen before. This helps to prevent models from being overfit to the specific spatial or temporal patterns that we see in the data. + +There are many strategies for spatio-temporal resampling strategies, including k-means clustering, block cross-validation, or temporal validation strategies. The K-means clustering strategy is a good choice because it creates relatively even clusters even with uneven distributions of points, as we see in our data set. 
+ +```{r} +set.seed(12345) +rsmp_nonsp <- rsmp("cv", +                   folds = 5) + +autoplot(rsmp_nonsp, ls_task, fold_id = c(1:5), pch = 20, alpha = 0.5) + +  plot_layout(widths = 1) +``` + +```{r} +set.seed(12345) +rsmp_clus <- rsmp("spcv_coords", +                  folds = 5) +autoplot(rsmp_clus, ls_task, fold_id = c(1:5), pch = 20, alpha = 0.5) + +  plot_layout(widths = 1) +``` + +Here we see the difference between non-spatial resampling and spatial resampling. + +Divisions based on year, or a combination of lithology and year were also considered, but were deemed impractical because of the imbalanced splits that they produced. + +## Feature Selection + +### Step-forward feature selection + +Choosing good features is an essential part of the machine learning process. A good model should use features that are correlated to the output class and independent from each other. The other main goal of feature selection is to choose the smallest possible set of features that produces a good model, which both reduces the runtime of training models and the risk of over-fitting. The definition of a "good model" can be difficult to define, but for our purposes, we use the area under the ROC curve (AUC) as a measure of model performance. A higher AUC indicates a better model, so the feature selection processes will try to maximize this value. + +Ideally, we could do an exhaustive search of all possible combinations of features, which leads to 2\^n possible combinations of features. With our 13 features, this is already 8192 possible configurations. When you add a layer of cross validation resampling (we use 5 folds and 2 repetitions), the runtime of this task rapidly grows; we approach the 10,000s to 100,000s of models to evaluate. + +Using the AUC allows us to work in the `mlr3` ecosystem, but ultimately, we want to evaluate the model based on the proportions and generate a success-rate curve.
So, we use a combination of filtering methods, feature selection, and success-rate curves to determine which features produce the best model. + +Instead, we do a forward stepping feature selection process, called `sequential` in `mlr3fselect`. First, a model is built with a single candidate predictor. The predictor from the single-variable model that performs the best is then chosen, and a two-variable model is trained with that variable and the remaining variable. These steps are repeated until there are no more variables left to add. The model with the highest measure is then chosen (here, AUC). Generally, the best model will turn out to be one of the models that does not include all of the predictors. We create an instance of the feature selector and define our model-type (`learner`), resampling method, evaluation measure, and give it the data (`task`). + +```{r} +run <- FALSE + +if (run) { +  instance <- fselect( +    fselector = fs("sequential"), +    task = ls_task, +    learner = lrn("classif.log_reg", predict_type = "prob"), +    resampling = rsmp("repeated_spcv_coords", +                      folds = 5, +                      repeats = 3), +    measure = msr("classif.auc") +  ) +} +``` + +For our task, the best model uses age, distance to a road, partial contributing area (48 hr), and mean curvature. + +Let's do the same thing, but leave out stand age and distance to road. We are interested in how these variables affect the landslide initiation probability, but don't want to include them in the final model that is linked with the runout model. + +```{r} +run <- FALSE + +if (run) { +  instance <- fselect( +    fselector = fs("sequential"), +    task = ls_task_less, +    learner = lrn("classif.log_reg", predict_type = "prob"), +    resampling = rsmp("repeated_spcv_coords", +                      folds = 5, +                      repeats = 3), +    measure = msr("classif.auc") +  ) +} +``` + +### Feature filtering + +Another way of evaluating feature importance is through a filter.
Filters use a correlation measure (of which there are many to choose from) to evaluate which features are most tied to the output class. A major difference between filters and other feature selection algorithms is that these methods do not evaluate which features perform best in a model; they only look at the correlation between two variables. Filters are often used when the number of features is large enough that a feature selection algorithm is impractical to run in a reasonable amount of time. Here, we use it to gather more information about which features may be the most useful. + +```{r} +info_filter <- flt("information_gain") +info_filter$calculate(ls_task) +res_info <- as.data.table(info_filter) +res_info$rank <- 1:length(res_info$feature) + +cmim_filter <- flt("cmim") +cmim_filter$calculate(ls_task) +res_cmim <- as.data.table(cmim_filter) +res_cmim$rank <- 1:length(res_cmim$feature) + +jmi_filter <- flt("jmi") +jmi_filter$calculate(ls_task) +res_jmi <- as.data.table(jmi_filter) +res_jmi$rank <- 1:length(res_jmi$feature) + +relief_filter <- flt("relief") +relief_filter$calculate(ls_task) +res_relief <- as.data.table(relief_filter) +res_relief$rank <- 1:length(res_relief$feature) +``` + +The output of these filters has been scaled and combined in the figure below. Each different method showed relatively similar ordering of the variables. Of note is that the longer durations for the partial contributing areas are consistently given better scores than shorter durations.
+ +```{r} +#| echo: false +filter_res <- res_info +filter_res$InfoGain <- (res_info$score / max(res_info$score)) +filter_res$CondMIM <- (res_cmim$score / max(res_cmim$score)) +filter_res$JMI <- (res_jmi$score / max(res_jmi$score)) +filter_res$Relief <- (res_relief$score / max(res_relief$score)) + +filter_res.m <- melt(filter_res[, c("feature", "InfoGain", "CondMIM", "JMI", "Relief")]) + +ggplot(filter_res.m) + + geom_bar(aes(x = reorder(feature, -value), y = value, fill = variable), stat = "identity", position = "stack") + + scale_fill_manual(values = viridis(5)[1:4], labels = c("Information Gain", "Minimal Conditional \nMutual Information \nMaximization", "Joint Mutual Information", "Relief"), name = "Filter name") + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, face = "bold"), + axis.text.y = element_blank(), + axis.ticks.y = element_blank()) + + labs(x = "", y = "Score") + + scale_x_discrete(labels = c("ln(pca), 48 hrs", "mean curvature", "ln(pca), 24 hrs", "total accumulation", "gradient * pca, 48 hrs", "ln(pca), 12 hrs", "ln(pca), 6 hrs", "gradient^2", "gradient", "distance to a road", "geology", "stand age")) + + facet_grid(variable ~ .) + + theme(legend.position = "none") + +``` + +## Training the model + +From the feature selection processes, we choose a handful of possible feature selection algorithms and do a more in-depth analysis using success-rate curve as the metric for model performance. This involves training a model and then using the model to predict on DEMs. + +We start with a model that uses gradient, 48-hour partial contributing area, and the product of the two. 
+ +```{r} +set.seed(12345) +ls_task$select(c("ln_pca_k1_48", "mean_curv", "gradient")) + +# create a pipeline for data preprocessing +scale_features <- + po("scale") %>>% + po("encode") +scale_features$train(ls_task) +ls_task_eng <- scale_features$predict(ls_task)$encode.output + +# define the type of model +ls_model <- lrn("classif.log_reg", + predict_type = "prob") + +model <- mlr3::resample( + task = ls_task_eng, + learner = ls_model, + resampling = rsmp("repeated_spcv_coords", + folds = 5, + repeats = 50) +) + +model$aggregate(msr("classif.auc")) +``` + +First, we run a whole bunch of spatial resampling iterations to get a confident estimate of how well the model will perform. For this estimation, we are only able to look at measures that can be tested in the framework of `mlr3`. We look at area under the ROC curve (AUC). + +Our final model is trained on all of the data, to maximize the use of our landslide inventory. + +```{r} +ls_model$train(ls_task_eng) +ls_model$model +``` + +### Feature Importance + +```{r} +# set up the data frame + +# define a machine-learning task +task <- as_task_classif_st( + x = sample_pnts[, training_cols], + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910" +) + +# define the model to apply +log_reg <- lrn("classif.log_reg", + predict_type = "prob") +log_reg$train(task) # train (calibrate) the model on the entire data set + +# set up a data frame with just the features +x = sample_pnts[, training_cols] + +# create a new instance of an iml Predictor object for this model +model = iml::Predictor$new(log_reg, data = x, y = sample_pnts$class) + +# Calculate importance for all features +importance = iml::FeatureImp$new(model, loss = "ce") # "ce" is a measure of classification error +num_features = c("Gradient", "Tan_Curv", "Norm_Curv", "age", "geo") +importance$plot(features = num_features) + + labs(title = "Permutation Feature Importance", + subtitle = "For 
predicting landslide initiation") +``` + +## Predicting + +### Single basin + +The predicting directory must have the elevation derivative files for the required predictors. + +```{r} +pred_dir <- params$pred_dir + +# find and load the data files +topo_files <- c(paste0("GRADIENT,", pred_dir, "Umpqua/basin_1/gradient.flt"), + paste0("MEAN CURVATURE,", pred_dir, "Umpqua/basin_1/mean_curv.flt")) +topo_rast <- elev_deriv(rasters = topo_files) +pca_rast <- contributing_area(raster = paste0(pred_dir, "Umpqua/basin_1/pca_6_1.flt")) +rasters <- c(topo_rast, pca_rast) +names(rasters) <- c("gradient", "mean_curv", "pca_k1_48") + +# convert to a data frame and add the feature interactions. +pd <- as.data.frame(rasters, xy = TRUE) +pd$grad_by_pca <- pd$gradient * pd$pca_k1_48 +pd$ln_pca_k1_48 <- log(pd$pca_k1_48) + +# manually pre process the data: take the log of the pca and center+scale the data +center_vals <- scale_features$pipeops$scale$state$center +scale_vals <- scale_features$pipeops$scale$state$scale + +# pd$grad_by_pca <- (pd$grad_by_pca - center_vals[1]) / scale_vals[1] +pd$gradient <- (pd$gradient - center_vals[1]) / scale_vals[1] +pd$ln_pca_k1_48 <- (pd$ln_pca_k1_48 - center_vals[2]) / scale_vals[2] +pd$mean_curv <- (pd$mean_curv - center_vals[3]) / scale_vals[3] + +pd <- subset(pd, select = c("x", "y", "mean_curv", "gradient", "ln_pca_k1_48")) +``` + +```{r} +# predict the basin using scaled and centered data. +predictions_df <- as.data.table(ls_model$predict_newdata(pd)) +predictions_df <- cbind(pd, predictions_df) +``` + +Here we predict on a sub-basin of the larger Umpqua watershed. + +```{r} +pts <- terra::vect(params$init_pts) +pts_local <- crop(pts, rasters) + +ggplot(predictions_df) + + geom_tile(aes(x = x, y = y, fill = prob.pos)) + + scale_fill_viridis(option = "A", direction = -1) + + geom_spatvector(data = pts_local) +``` + +### Multiple basins + +To generate a success rate curve, we combine multiple basins. 
The success rate curve is the proportions of landslide predicted over the proportion of the predicted area. A smaller area under the success rate curve indicates that the model was more specific with the areas that it deemed to have a high probability of landslide initiation. + +```{r} +basin_list <- list.files(params$dem_dir, paste0("^basin_[1-9]{1,3}\\.flt"), recursive = TRUE) +# basin_list <- list.dirs(pred_dir, recursive = TRUE, full.names = FALSE) +predict = TRUE +``` + +```{r} +model <- ls_model +bins <- as.data.table(seq(0, 1, length.out = 500)) +names(bins) <- c("probability") + +count <- 1 +cumul_prop <- data.frame("area" = rep(0, 500), + "prob" = rep(0, 500)) +pts_prop <- data.frame("x" = 0, + "y" = 0, + "prob.pos" = 0, + "prob.cumsum" = 0, + "area.cumsum" = 0, + "ls.prop" = 0) + +if (predict) { + for (basin in basin_list) { + + basin <- sub("\\..*","", basin) + + tic(msg = paste0("loop for ", basin)) + + # find and load the data files + topo_files <- c(paste0("GRADIENT,", pred_dir, basin, "/gradient.flt"), + paste0("MEAN CURVATURE,", pred_dir, basin, "/mean_curv.flt")) + print(topo_files) + topo_rast <- elev_deriv(rasters = topo_files) + + pca_file <- list.files(paste0(pred_dir, basin), "^pca_6_[0-9]{1,3}\\.flt", full.names = TRUE) + pca_rast <- contributing_area(raster = pca_file) + count <- count + 1 + rasters <- c(topo_rast, pca_rast) + names(rasters) <- c("gradient", "mean_curv", "pca_k1_48") + + predicting_data <- as.data.frame(rasters, xy = TRUE) + predicting_data$ln_pca_k1_48 <- log(predicting_data$pca_k1_48) + # predicting_data$grad_by_pca <- predicting_data$gradient * predicting_data$pca_k1_48 + + pd <- predicting_data + # pd <- subset(pd, select = c("x", "y", "grad_by_pca", "gradient", "ln_pca_k1_48")) + pd <- subset(pd, select = c("x", "y", "mean_curv", "gradient", "ln_pca_k1_48")) + + + # manually pre process the data: take the log of the pca and center+scale the data + center_vals <- scale_features$pipeops$scale$state$center + scale_vals <- 
scale_features$pipeops$scale$state$scale + + # pd$grad_by_pca <- (pd$grad_by_pca - center_vals[1]) / scale_vals[1] + pd$gradient <- (pd$gradient - center_vals[1]) / scale_vals[1] + pd$ln_pca_k1_48 <- (pd$ln_pca_k1_48 - center_vals[2]) / scale_vals[2] + pd$mean_curv <- (pd$mean_curv - center_vals[3]) / scale_vals[3] + + # predict the basin using scaled and centered data. + predictions_df <- as.data.table(model$predict_newdata(pd)) + predictions_df <- cbind(predictions_df, pd) + + proportions <- as.data.table(subset(predictions_df, select = c("x", "y", "prob.pos"))) + setorder(proportions, cols = "prob.pos") + + # calculate the cumulative sums of area and probabilities + proportions$prob.cumsum <- cumsum(proportions$prob.pos) + prob_sum <- max(proportions$prob.cumsum) + proportions$area.cumsum <- seq(1, length(proportions$x)) + area_sum <- max(proportions$area.cumsum) + + setattr(proportions, "sorted", "prob.pos") + + binned_prop <- proportions[J(bins$probability), roll = "nearest"] + cumul_prop$area <- cumul_prop$area + binned_prop$area.cumsum + cumul_prop$prob <- cumul_prop$prob + binned_prop$prob.cumsum + + toc() +} +} + +total_area <- max(cumul_prop$area) +total_prob <- max(cumul_prop$prob) + +cumul_prop$area_prop <- cumul_prop$area / total_area +cumul_prop$prob_prop <- cumul_prop$prob / total_prob + +dt <- cbind(bins, cumul_prop)[J(pts_prop$prob.pos), roll = "nearest", on = "probability"] +pts_prop$area_prop <- dt$area_prop +``` + +```{r} +load(paste0(pred_dir, "grad_pca_gbyp.Rdata")) +success_curve_2 <- cbind(bins, cumul_prop) + +ggplot(success_curve) + + geom_line(aes(x = area_prop, y = prob_prop), color = "red") + + geom_point(aes(x = area_prop, y = prob_prop), color = "red") + + geom_line(data = success_curve_2, aes(x = area_prop, y = prob_prop), color = "blue") + + geom_point(data = success_curve_2, aes(x = area_prop, y = prob_prop), color = "blue") + + scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) + + scale_y_continuous(limits = c(0, 1), expand = 
c(0, 0)) + + labs(x = "Proportion of area", y = "Proportion of total probability") + + theme(legend.position = "right") + + theme(panel.grid = element_blank(), + panel.background = element_rect(fill = "white", linewidth = 1, color = "black",), + panel.border = element_rect(linewidth = 1, color = NA, fill = NA)) +``` diff --git a/analysis/comparing_landslide_models.qmd b/analysis/comparing_landslide_models.qmd new file mode 100644 index 0000000..43cbe79 --- /dev/null +++ b/analysis/comparing_landslide_models.qmd @@ -0,0 +1,355 @@ +--- +title: "Modeling Landslide Initiation" +author: Julia Lober +format: html +editor: visual +--- + +```{r setup} +#| echo: false +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + error = FALSE +) +``` + +```{r load, include = FALSE} +library(mlr3) +library(mlr3spatial) +library(mlr3spatiotempcv) +library(mlr3learners) +library(mlr3tuning) +library(mlr3viz) +library(mlr3fselect) +library(mlr3filters) +library(mlr3pipelines) + +# Packages for model-building +library(glmnet) +library(ranger) + +# Packages for other things +library(terra) +library(ggplot2) +library(maps) +library(tidyterra) +``` + +```{r} +source("../R/create_model_dataframe.R") +sample_pnts <- training_data[!(is.na(training_data$dist_to_road)), 2:length(training_data)] +sample_pnts$geo <- as.factor(sample_pnts$geo) +# sample_pnts <- subset(sample_pnts, select = -c(year)) + +sample_pnts$gradient_sq <- sample_pnts$gradient ^ 2 +sample_pnts$grad_by_pca <- sample_pnts$gradient * sample_pnts$pca_k1_48 +``` + +## A look at the data. + +The first step is to look at the data we are using to build the model. We are working with data files that have been assembled by the PFA_Sampling_Multiple_DEMs documents included in this package. This package produces .Rdata files that contain data frames of landslide points divided by the approximate year in which they occurred and non-landslide points sampled from the surrounding area divided in the same way.
+ +The data frames contain a variety of predictors selected based on the physical parameters that we know control landslides. These predictors have various sources of data which will be explained later. + +```{r} +#| echo: false +head(sample_pnts) +``` + +```{r} +#| echo: false +points <- vect(cbind(sample_pnts[, "x"], y = sample_pnts[, "y"])) +points$yr <- sample_pnts$year +points$class <- sample_pnts$class +crs(points) <- "epsg:26910" +points <- project(points, "epsg:4326") + +load("/Users/julialober/Documents/terrainworks/code/sandbox/data/oregon_map.Rdata") + +ggplot() + + geom_polygon(data = oregon, mapping = aes(x = long, y = lat), + color = "white", fill = "white") + + # geom_spatvector(data = points[points$class == "neg", ], + # mapping = aes(shape = as.factor(points[points$class == "neg", ]$yr)), + # show.legend = TRUE, size = 1 , color = "black", alpha = 1) + + geom_spatvector(data = points[points$class == "pos", ], + mapping = aes(shape = as.factor(points[points$class == "pos", ]$yr)), + show.legend = TRUE, size = 1, color = "red", alpha = 0.4) + + scale_shape_manual(values = c(3, 1, 17), labels = c("1996 (213)", "2007 (137)", "2011 (47)")) + + labs(x = "Longitude", y = "Latitude", shape = "Study points (count)") +``` + +Note that the landslide initiation points tend to come in clusters and the years 1996 and 2007 have many more points located than 2011. Negative landslide points are randomly selected within a buffer area near the points. There are other documents that explain the steps and decisions to assemble this data set, so I will not go into it any further. + +## Defining the modeling task and learner + +For `mlr3`, we need to define the task, the type of model (called a learner), and the resampling method that we want to use. The task is our landslide initiation data set and the learner (for now) is a logistic regression model. If we want to add a different type of model, this will be done in a different document.
+ +There are several steps we need to work through: + +1. Feature selection - Many options exist for feature selection methods that vary slightly in the way that they attribute value to different predictors and iterate through subsets. We explore a small selection in the feature_selection.qmd document, select a basic method from there. +2. Tuning hyperparameters - This step is only necessary if we decide to use a model that has hyperparameters, like a random forest model. Otherwise, we will ignore it. +3. Estimating performance - Here, we use a spatial cross-validation resampling method to assess the model performance while minimizing bias. By using spatial cross-validation, we keep testing and training sets closer to independent inputs, which should give us a more realistic guess of how accurate the model will be on new data. +4. Generating a proportion raster - Once we have the best model option as assessed from the previous steps, we can input all of the DEMs to produce a probability raster, which we will then convert to a proportion raster. + +```{r} +set.seed(12345) +sample_pnts <- subset(sample_pnts, select = -c(year)) + +landslide_task <- as_task_classif_st(x = sample_pnts, + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910") + +logreg_learner <- lrn("classif.log_reg", + predict_type = "prob") + +rf_learner <- lrn("classif.ranger", + predict_type = "prob") + +spat_resamp <- rsmp("repeated_spcv_coords", + folds = 5, + repeats = 2) +``` + +We will design a pipeline to compare the different models. + +```{r} +#| echo: false +autoplot(spat_resamp, + landslide_task, + fold_id = c(1:4)) +``` + +### 1. Feature selection + +The rule in machine learning is to create the best model possible with the fewest features needed. This not only makes the model quicker to train, but also makes it easier to predict (smaller data inputs) and also reduces the risk of over-fitting.
+ +```{r} +logreg_fselect <- auto_fselector( + fs("sequential"), + learner = logreg_learner, + resampling = spat_resamp, + measure = msr("classif.auc")) + +rf_fselect <- auto_fselector( + fs("sequential"), + learner = rf_learner, + resampling = spat_resamp, + measure = msr("classif.auc")) +``` + +## Make a pipeline + +```{r} + +pipe <- + po("encode") %>>% + ppl("branch", list( + "anova" = po("filter", filter = flt("anova")), + "nothing" = po("nop")), prefix_branchops = "filt") %>>% + ppl("branch", list( + "lr" = logreg_learner, + "rf" = rf_learner, + "lr_fs" = logreg_fselect, + "rf_fs" = rf_fselect)) + +pipe$plot(html = FALSE) +``` + +Now, we can define the parameters over which to evaluate the models and tune it. I use a random search to start just gathering an idea of the best ranges of parameters. Since we are comparing a handful of different modeling techniques, one of the major parameters to tune will be the model type, called "branch.selection" in the parameter set. + +I ran this initially with a filtering method as an option, but found the no matter the filtering, the forward feature selection works better. It is possible that combining filtering and forward feature would reduce run-time, but since we only have about 15 features to evaluate, I don't believe that the time save will be substantial enough for the possible performance trade-off. 
+ +```{r} +param_set <- ParamSet$new(list( + ParamDbl$new("anova.filter.frac", lower = 0.3, upper = 1), + ParamFct$new("filtbranch.selection", levels = c("anova", "nothing")), + ParamInt$new("classif.ranger.mtry", lower = 1L, upper = 11L), + ParamFct$new("branch.selection", levels = c("lr", "rf", "lr_fs", "rf_fs")) +)) + +target <- ti( + task = landslide_task, + learner = as_learner(pipe), + resampling = rsmp("spcv_coords", folds = 3), + measures = msr("classif.auc"), + terminator = trm("evals", n_evals = 20), + search_space = param_set + +) + +tuner <- tnr("random_search") + +# tuner$optimize(target) +``` + +```{r} +param_set <- ParamSet$new(list( + ParamDbl$new("anova.filter.frac", lower = 0.3, upper = 0.3), + ParamFct$new("filtbranch.selection", levels = c("nothing")), + ParamInt$new("classif.ranger.mtry", lower = 1L, upper = 11L), + ParamFct$new("branch.selection", levels = c("lr", "rf", "lr_fs")) +)) + +target <- ti( + task = landslide_task, + learner = as_learner(pipe), + resampling = rsmp("spcv_coords", folds = 3), + measures = msr("classif.auc"), + terminator = trm("evals", n_evals = 20), + search_space = param_set + +) + +tuner <- tnr("random_search") + +# tuner$optimize(target) +``` + +```{r} +param_set <- ParamSet$new(list( + ParamDbl$new("anova.filter.frac", lower = 0.3, upper = 0.3), + ParamFct$new("filtbranch.selection", levels = c("nothing")), + ParamInt$new("classif.ranger.mtry", lower = 1L, upper = 5L), + ParamFct$new("branch.selection", levels = c("rf_fs")) +)) + +target <- ti( + task = landslide_task, + learner = as_learner(pipe), + resampling = rsmp("spcv_coords", folds = 3), + measures = msr("classif.auc"), + terminator = trm("none"), + search_space = param_set + +) + +tuner <- tnr("grid_search", resolution = 5, batch_size = 400) + +# tuner$optimize(target) +``` + +```{r} +param_set <- ParamSet$new(list( + ParamDbl$new("anova.filter.frac", lower = 0.3, upper = 0.3), + ParamFct$new("filtbranch.selection", levels = c("nothing")), + 
ParamInt$new("classif.ranger.mtry", lower = 1L, upper = 10L), + ParamFct$new("branch.selection", levels = c("rf")) +)) + +target <- ti( + task = landslide_task, + learner = as_learner(pipe), + resampling = rsmp("spcv_coords", folds = 3), + measures = msr("classif.auc"), + terminator = trm("none"), + search_space = param_set + +) + +tuner <- tnr("grid_search", resolution = 10, batch_size = 100) + +# tuner$optimize(target) +``` + +This tuning process takes an average of 2 minutes \* number of evals to run. I have run it twice so far, and save the results in data files to minimize the number of times we want to re-run the code. + +Overall, the results have shown AUC values between 0.82 and 0.86 for all configurations of models. The logistic regression model with the forward feature selection performed significantly better than the logistic regression without feature selection, and slightly (though not significantly) better than the random forest models. However, logistic regression is much faster to train, especially when using a forward feature selection algorithm that involves training so many models. 
+ +```{r} +# load("/Users/julialober/Documents/terrainworks/code/sandbox/data/3.13.pipelinetuneresult.Rdata") +load("/Users/julialober/Documents/terrainworks/code/sandbox/data/3.14.pipelinetuneresult.Rdata") +# load("/Users/julialober/Documents/terrainworks/code/sandbox/data/3.14.rf_fs_mtry.Rdata") +``` + +```{r} +#| echo: false +to_plot <- cbind(as.data.frame(target_archive$classif.auc), (as.data.frame(target_archive$branch.selection))) +names(to_plot) <- c("auc", "model_type") +to_plot$model_type <- as.factor(to_plot$model_type) + +ggplot(to_plot, aes(x = model_type, y = auc)) + + geom_boxplot(aes(col = model_type)) + + geom_jitter(aes(col = model_type), position = position_jitter(0.2)) +``` + +## Logistic regression model + +Based on the results above, we will work with a logistic regression model and use forward feature selection to select the subset of features that provide the most amount of information. + +Our pipeline includes a scaling step, to even out the influence of each predictor, and an encoding step, to translate the geology predictor from a factor to a one-hot strategy that is treated more logically by a numerical model. 
+ +```{r} +logreg_learner <- lrn("classif.log_reg", + predict_type = "prob") + +spat_resamp <- rsmp("repeated_spcv_coords", + folds = 5, + repeats = 2) + +logreg_fselect <- auto_fselector( + fs("sequential"), + learner = logreg_learner, + resampling = spat_resamp, + measure = msr("classif.bbrier")) + +landslide_task_noage <- as_task_classif_st(x = subset(sample_pnts, select = -c(age, dist_to_road)), + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910") + + +pl_fselect <- + po("scale") %>>% + po("encode") %>>% + po("learner", logreg_fselect) + +pl_fselect$train(landslide_task_noage) +#pl_lrn$graph$pipeops$classif.log_reg.fselector$learner + +``` + +Now that we know the best set of features, we can train a model with the resampling method to get a more exact estimate of how well the final model will actually perform. To do this, we use a k-means clustering based spatial resampling method, with 8 folds (7 for training, 1 for testing) and 50 repeats, to get a measure of performance that we can be relatively confident in. + +Then, we look at the aggregate AUC measure for all 50\*8 models that were trained (400 models total). + +```{r} +feature_set <- c("age", "dist_to_road", "mean_curv", "pca_k1_48") +# feature_set <- c("mean_curv", "pca_k1_48") +landslide_task$select(feature_set) + +pl_feature_eng <- + po("scale") %>>% + po("encode") + +pl_feature_eng$train(landslide_task) +landslide_task_eng <- pl_feature_eng$predict(landslide_task)$encode.output + +model <- mlr3::resample( + task = landslide_task_eng, + learner = logreg_learner, + resampling = rsmp("repeated_spcv_coords", + folds = 8, + repeats = 50) +) + +model$aggregate(msr("classif.auc")) + +``` + +Our final step is to train a model using all of the data. While this is generally frowned upon in machine learning, our inventory produces a relatively small data set for this project and we want to leverage all of the data available. 
+ +TODO: look up coefficient interpretation. Does a bigger coefficient mean that the variable has a bigger influence? I think yes, if the data are scaled first. + +Now that I am using the scaled version of the data, I agree. The pca value has the largest coefficient, and is consistently the variable that is chosen first during the feature selection, indicating that it is a highly explanatory variable for our dataset. + +```{r} +logreg_learner$train(landslide_task_eng) +logreg_learner$model +``` diff --git a/analysis/feature_selection.qmd b/analysis/feature_selection.qmd new file mode 100644 index 0000000..8bbe04a --- /dev/null +++ b/analysis/feature_selection.qmd @@ -0,0 +1,228 @@ +--- +title: "Landslide initiation feature selection" +author: Julia Lober +format: html +editor: visual +--- + +```{r load, include = FALSE} +library(mlr3) +library(mlr3spatial) +library(mlr3spatiotempcv) +library(mlr3learners) +library(mlr3tuning) +library(mlr3viz) +library(mlr3fselect) +library(mlr3filters) +library(mlr3verse) +``` + +## Feature selection for landslide points + +This document explores the impact of each predictor on a logistic regression model. We will look at a couple of different ways to do feature selection, and see if they offer consistent results or different options each time. + +```{r} +source("../R/create_model_dataframe.R") +sample_pnts <- training_data[!(is.na(training_data$dist_to_road)), 2:length(training_data)] +``` + +```{r} +names(sample_pnts) +``` + +```{r} +task_orig <- as_task_classif_st(x = sample_pnts, + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910") +``` + +We'll also look at adding a few manual predictors that might help us develop the relationship that we observe, or expect to observe based on our understanding of the physical processes. 
+ +```{r} +sample_pnts_ext <- sample_pnts +sample_pnts_ext$gradient_sq <- sample_pnts$gradient ^ 2 +sample_pnts_ext$grad_by_pca <- sample_pnts$gradient * sample_pnts$pca_k1_48 + +task_ext <- as_task_classif_st(x = sample_pnts_ext, + id = "landslide_initiation_ext", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910") +``` + +## Filtering + +Lets look at a few filtering methods. These calculate some measure of correlation between the target variable and the predictors, then assigns a threshold which must be met in order to be included in the final model. + +The benefit of these methods are a simpler method that does not involve building a bunch of models to test the impact of different features. The downside is that it is possible that the filter does not truly reflect the impact on the final model. + +```{r} +task <- task_ext +``` + +```{r} +info_filter = flt("information_gain") +info_filter$calculate(task) + +res_info <- as.data.table(info_filter) +res_info$rank <- 1:length(res_info$feature) +``` + +```{r} +anova_filter = flt("anova") +anova_filter$calculate(task) + +res_anova <- as.data.table(anova_filter) +res_anova$rank <- 1:length(res_anova$feature) +``` + +```{r} +auc_filter = flt("auc") +auc_filter$calculate(task) + +res_auc <- as.data.table(auc_filter) +res_auc$rank <- 1:length(res_auc$feature) +``` + +```{r} +cmim_filter = flt("cmim") +cmim_filter$calculate(task) + +res_cmim <- as.data.table(cmim_filter) +res_cmim$rank <- 1:length(res_cmim$feature) +``` + +```{r} +mim_filter = flt("mim") +mim_filter$calculate(task) + +res_mim <- as.data.table(mim_filter) +res_mim$rank <- 1:length(res_mim$feature) +``` + +#### Compare the filters. 
+ +```{r} +results <- merge(res_auc, res_anova, by = "feature", suffixes = c(".auc", ".anova")) +results <- merge(results, res_info, by = "feature", suffixes = c("", ".info")) +results <- merge(results, res_cmim, by = "feature", suffixes = c("", ".cmim")) +results <- merge(results, res_mim, by = "feature", suffixes = c("", ".mim")) +results$total <- rowSums(cbind(results$rank.auc, results$rank.anova, results$rank, results$rank.cmim, results$rank.mim)) + +barplot(height = results$total, names = results$feature, las = 2) + +``` + +## Basic forward feature selection + +This method tests the model created adding one feature each time until the model performance is no longer improving. We can use a variety of + +```{r} +lgr::get_logger("mlr3")$set_threshold("warn") +set.seed(12345) +instance = fselect( + fselector = fs("sequential"), + task = task, + learner = lrn("classif.log_reg", predict_type = "prob"), + resampling = rsmp("repeated_spcv_coords", + folds = 5, + repeats = 3), + measure = msr("classif.auc") +) +``` + +```{r} +dt <- as.data.table(instance$archive) +dt[batch_nr == 1, 1:14] + +``` + +The result we have here is likely a light over-estimation of modeling performance. The best way to assess performance is to do nested resampling - an outer resampling strategy that reserves a test set for estimating model performance, and an inner resampling strategy that is used for the feature selection process (or, hyperparameter tuning). I do implement this below. + +```{r} +instance$result +instance$result_feature_set +``` + +We can also use the feature selector as an auto_fselector, which inherits from the learner class and can be treated as such. This allows us to put it in a benchmark function and compare it to other learners. 
+ +```{r} +fs_learner = auto_fselector( + fselector = fs("sequential"), + learner = lrn("classif.log_reg", predict_type = "prob"), + resampling = rsmp("repeated_spcv_coords", + folds = 5, + repeats = 2), + measure = msr("classif.auc") +) + +# Tried, did not improve training time (by much) and performed slightly worse. +fs_learner_rand = auto_fselector( + fselector = fs("random_search"), + learner = lrn("classif.log_reg", predict_type = "prob"), + resampling = rsmp("repeated_spcv_coords", + folds = 5, + repeats = 2), + measure = msr("classif.auc") +) +``` + +```{r} +lgr::get_logger("mlr3")$set_threshold("error") +bm_grid <- benchmark_grid( + task = task, + learner = list(fs_learner, lrn("classif.log_reg", predict_type = "prob")), + resampling = rsmp("spcv_coords", folds = 3) +) + +bm <- benchmark(bm_grid) +``` + +It seems that there is not really any significant improvement of the feature-selected model over the normal model. + +```{r} +aggr = bm$aggregate(msrs(c("classif.auc", "time_train"))) +as.data.frame(aggr)[, c("learner_id", "classif.auc", "time_train")] +autoplot(bm, measure = msr("classif.auc")) +``` + +```{r} +rf_learner = auto_fselector( + fselector = fs("sequential"), + learner = lrn("classif.ranger", predict_type = "prob"), + resampling = rsmp("repeated_spcv_coords", + folds = 5, + repeats = 2), + measure = msr("classif.auc") +) + +outer_resample <- rsmp("repeated_spcv_coords", folds = 5, repeats = 5) +rr <- resample(task, rf_learner, outer_resample, store_models = TRUE) +``` + +```{r} +rr2 <- resample(task, lrn("classif.ranger", predict_type = "prob"), outer_resample, store_models = TRUE) + +rr2$score(measures = msr("classif.auc"))[, "classif.auc"] +``` + +Lets try using the Aikake Information Criterion for the forward feature selection. + +ERROR: this always results in an error that says there is no model stored in classif.log_reg(). I don't think that it is worth the time to try to figure this error out. 
+ +```{r} +# lgr::get_logger("mlr3")$set_threshold("debug") +# aic_select = fselect( +# fselector = fs("sequential"), +# task = task, +# learner = lrn("classif.log_reg", predict_type = "prob"), +# resampling = rsmp("repeated_spcv_coords", +# folds = 3, +# repeats = 1), +# measure = msr("aic") +# ) +``` diff --git a/analysis/mlr3_workflow.qmd b/analysis/mlr3_workflow.qmd new file mode 100644 index 0000000..ac7afa9 --- /dev/null +++ b/analysis/mlr3_workflow.qmd @@ -0,0 +1,200 @@ +--- +title: "mlr3 Workflow" +author: "Julia Lober" +format: html +editor: visual +params: + data_dir: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/ + input: text + training_data: + label: Training data .Rdata file + value: sample_wilson.Rdata + input: text + points: + label: Training data points + value: DeanCr/DeanCr_Initiation_Points.shp + input: text + dem: + label: Training data dem + value: DeanCr/elev_deancr.flt + input: text +--- + +```{r setup} +#| echo: false +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + error = FALSE +) +``` + +```{r} +#| echo: false +# load(paste0(params$data_dir, params$training_data)) +# pnts <- vect(paste0(params$data_dir, params$points)) +# dem <- rast(paste0(params$data_dir, params$dem)) +source("../R/create_model_dataframe.R") +sample_pnts <- training_data +``` + +## Workflow for the mlr3 package + +This is a document where I work out how to use the `mlr3` package. This package ecosystem is more modern and agile than the `caret` package, although because it is newer it has a smaller user base and a little bit less online resources. + +```{r load, include = FALSE} +library(mlr3) +library(mlr3spatial) +library(mlr3spatiotempcv) +library(mlr3learners) +library(mlr3tuning) +library(mlr3viz) + +# Packages for model-building +library(glmnet) +library(ranger) +``` + +Major points for the `mlr3` package include the different nomenclature. 
The data used to train a model is referred to a `task` and the model being trained is referred to as a `learner`. + +This is a general workflow for any type of model. Another major difference between `mlr3` and `caret` is the names of the model types. The `mlr3learners` package provides the following model types. + +```{r} +mlr_learners +``` + +## Step 1. Creating a task and a learner + +Our task leaves out the `id` column in the sample_pnts data frame, as it will not be relevant information for the model! Specifying the target says what we want the learner to predict - for us, this is the `class` which is positive for landslide initiation points. + +```{r} +task <- as_task_classif(x = sample_pnts[, 2:10], id = "landslide_init", target = "class") +head(task) +``` + +The learner is created based on what type of model we want to train. See above for a list of learners that are available (and keep in mind that you could load `mlr3extralearners` for an even bigger selection). We are interested in classification learners, since the input is categorized into two different classes. + +`classif.glmnet` is a linear regression strategy for classification problems. Notably, `lambda` is a hyperparameter that cannot be automatically optimized. A bandaid solution was to specify a value in the creation of the learner, which silences a warning at prediction time. I am currently reading about `mlr3tuning` and how to create a wrapper tuner object that allows manual specifications for hyperparameter optimization (HPO). + +```{r} +learner <- lrn("classif.log_reg", + predict_type = "prob") +print(learner) +``` + +As an experiment, we could train and predict using all of the data. This is bad practice in machine learning but can be an interesting baseline and will help me figure out the syntax. 
+ +```{r} +learner$train(task) +print(learner$model) +``` + +```{r} +predicted <- learner$predict(task) + +``` + +```{r} +tsk_predict = as_task_unsupervised(dem) + +# plan("multisession") # optional parallelization +pred = predict_spatial(tsk_predict, learner, format = "terra") +class(pred) +``` + +Next, we can take a general look at some performance measures. + +```{r} +head(as.data.table(mlr_measures)) +``` + +Print some of these metrics. + +`classif.acc` is the accuracy of the model and `classif.ce` is the error. These should always be inverses of each other. + +`classif.auc` is the area under the ROC curve. Since we are interested more in the probability of a yes (landslide initiation) than simply classification into binary classes, this measure is a bit more useful. + +```{r} +scores <- predicted$score(msrs(c("classif.acc", "classif.ce", "classif.auc"))) +print(scores) + +conf_matrix <- predicted$confusion +print(conf_matrix) +``` + +Resampling: + +```{r} +resampling <- rsmp("cv", folds = 5) + +learner_2 <- lrn("classif.ranger", + predict_type = "prob") +learner_3 <- lrn("classif.featureless", + predict_type = "prob") + +bench <- benchmark_grid(task = task, + learner = c(learner, learner_2, learner_3), + resampling = resampling) + +mark <- benchmark(bench) + +``` + +```{r} +autoplot(mark, measure = msr("classif.acc")) +``` + +```{r} +autoplot(mark, type = "roc") +``` + +## Step 2. Tune and resample + +In order to train this model with some hyperparameter tuning, we need to go back and specify which hyperparameters need to be tuned in the learner. 
+ +```{r} +learner <- lrn("classif.glmnet", + alpha = to_tune(1e-5, 1), + lambda = 0.1, + # s = to_tune(1e-5, 1e5), + predict_type = "prob") + +tuner = tnr("grid_search", resolution = 5, batch_size = 5) + +resampling <- rsmp("cv", folds = 5) + +learner_at = auto_tuner( + method = tnr("grid_search", resolution = 5, batch_size = 5), + learner = learner, + resampling = resampling, + measure = msr("classif.auc"), +) +print(learner_at) +``` + +```{r} +learner_at$train(task) +``` + +```{r} +predicted <- learner_at$predict(task) +print(predicted$score(msrs(c("classif.auc", "classif.acc")))) +print(predicted$confusion) +``` + +## Step 3. Comparing spatiotemporal cross-validation vs normal cross-validation. + +For any machine learning workflow, you want to including a resampling method that will train models on a couple different subsets of data as training and testing sets to reduce the role of chance and produce a more robust basis for evaluating the model's performance. + +I need data with x and y coordinates attached to do this. This is currently my standstill. 
+ +```{r} +plot(dem) +points(pnts) +``` + +```{r} + +``` diff --git a/analysis/pca_training_data.Rmd b/analysis/pca_training_data.Rmd new file mode 100644 index 0000000..cad7011 --- /dev/null +++ b/analysis/pca_training_data.Rmd @@ -0,0 +1,76 @@ +--- +title: "Principal component analysis on landslide training data" +author: Julia Lober +output: + html_notebook: + df_print: paged + html_document: + toc: true +params: + data_dir: + label: Data Directory + value: //Mac/Home/Documents/terrainworks/code/sandbox/data/ + input: text + training_data: + label: Training data .Rdata file + value: /sample_umpqua.Rdata + input: text +editor_options: + chunk_output_type: inline +--- + + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + error = FALSE +) +``` + +# Principal component analysis +A principal component analysis (PCA) allows us to reduce the dimensionality of data and visualize the variation in a dataset. It is usually an exploratory analysis rather than a key part of a quantitative analysis. Here, we do it to see where different covariates might be correlated and how the initiation points vs. non-inititation points are reflected in the overall variability. + +```{r} + +load(paste0(params$data_dir, params$training_data)) + +pc <- prcomp(sample_pnts[,2:7], + center = TRUE, + scale. 
= TRUE) + +p <- plot(pc$x[sample_pnts$class == "neg",], pch = 19, + main = "Principal Components Analysis", font.main = 4) +points(pc$x[sample_pnts$class == "pos",], col = "red", pch = 19) + +legend("topright", + legend = c("Negative points", "Landslide initiation points"), + pch = 19, + col = c("black", "red"), + bty= "n") + +l.x <- cbind(pc$rotation[,1]) * 2 +l.y <- cbind(pc$rotation[,2]) * 2 +arrows(x0=0, x1=l.x, y0=0, y1=l.y, col="darkgreen", length=0.15, lwd=3) +text(l.x, l.y, labels=rownames(l.x) , col="darkgreen", pos=c(3, 1, 3, 1), offset=1, cex=1.2) + + + +``` + +The first two principal components are the ones that explain the most amount of variability in the data. This should be the axis across which we see the data as separated as it gets. We hope to see some separation of the different categories we are interested (in this case, landslide initiation points and negative points), indicating that the covariates we are looking at can explain the differences between these classes. + +For the example Umpqua dataset, there is a slight trend toward landslide initiation points being on the left side of the plot, but lots of overlap/noise in the plots still. + +```{r} +preset <- par(mfrow = c(1, 2)) + +barplot(pc$rotation[,1], main="PC 1 Loadings Plot", las=2, + ylim = c(-1, 1)) +barplot(pc$rotation[,2], main="PC 2 Loadings Plot", las=2, + ylim = c(-1, 1)) + +par(preset) + +``` +Looking at the loadings of each covariate on the principal components, we can try to grasp what each of the dimensions is trying to capture. For example, the first principal component is mostly from the partial contributing area calculations, while the second component is by far most affected by gradient. 
diff --git a/analysis/prcomp_training_data.qmd b/analysis/prcomp_training_data.qmd new file mode 100644 index 0000000..26ef90e --- /dev/null +++ b/analysis/prcomp_training_data.qmd @@ -0,0 +1,117 @@ +--- +title: "Principal component analysis on training data" +author: "Julia Lober" +format: html +editor: visual +params: + data_dir: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/ + input: text + training_data: + label: Training data .Rdata file + value: sample_umpqua.Rdata + input: text + points: + label: Training data points + value: DeanCr/DeanCr_Initiation_Points.shp + input: text + dem: + label: Training data dem + value: DeanCr/elev_deancr.flt + input: text +--- + +```{r setup} +#| echo: false +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + error = FALSE +) +``` + +```{r load, include = FALSE} +library(terra) +``` + +```{r} +#| echo: false +load(paste0(params$data_dir, params$training_data)) +pnts <- vect(paste0(params$data_dir, params$points)) +dem <- rast(paste0(params$data_dir, params$dem)) +``` + +## Looking at the data: + +Change the parameters in the .qmd document to use your own dataset for this markdown document. Right now, we are looking at the Dean Creek area of the Umpqua Basin. The covariates for model building are extracted from a LIDAR digital elevation model (DEM) using FORTRAN scripts. The points are from the DOGAMI landslide inventory. + +```{r} +#| echo: false +plot(dem) +points(pnts) +``` + +## Training Data + +A sampling strategy for selecting negative points gives us a data frame of training data with a random selection of non-landslide initiation points based on a buffer. There is a full description of the negative sampling strategy in a different document in this package. + +There are 8 columns that represent the covariates used for building the model. Gradient and mean curvature are the two physical parameters chosen based on the condition needed for landslide initiation. 
There are also 4 partial contributing area calculations for storms of different duration (6hr, 12hr, 24hr, and 48hr). + +```{r} +head(sample_pnts) +``` + +## Principal component analysis (PCA) + +I am using the built-in `prcomp()` function from the base `stats` package. + +```{r} +pc <- prcomp(sample_pnts[,2:7], + center = TRUE, + scale. = TRUE) +``` + +Let's look at the results of the analysis. We can graph the data on the axis of the first two principal components. These are linear combinations of the covariates that we provide that should explain variability in the data. Visually, we should see the data as separated as it gets. We can graph the landslide initiation points and negative points in different colors to see whether these principal components can point to differences between our two classes. + +```{r} +#| echo: false +old_par <- par(xpd = TRUE, mar = c(2, 2, 2, 8)) + +p <- plot(pc$x[sample_pnts$class == "neg",], + pch = 20, + main = "Principal Components Analysis", + font.main = 2, + xlim = c(-6, 6), + ylim = c(-4, 4), + xlab = "Principal component 1", + ylab = "Principal component 2") +points(pc$x[sample_pnts$class == "pos",], col = "red", pch = 20) + +legend("topright", + inset = c(-.3, 0), + legend = c("Negative points", "Landslide \ninitiation points"), + pch = 20, + col = c("black", "red"), + bty= "n") + +l.x <- cbind(pc$rotation[,1]) * 2 +l.y <- cbind(pc$rotation[,2]) * 2 + +par(old_par) +``` + +We can look at the contributions of each covariate to each of the principal components. We see that the first principal component is a fairly equal combination of partial contributing areas and the second principal component is mostly contributed to by gradient. 
+ +```{r} +preset <- par(mfrow = c(1, 2)) + +barplot(pc$rotation[,1], main="PC 1 Loadings Plot", las=2, + ylim = c(-1, 1)) +barplot(pc$rotation[,2], main="PC 2 Loadings Plot", las=2, + ylim = c(-1, 1)) + +par(preset) +# text(l.x, l.y, labels=rownames(l.x) , col="darkgreen", pos=c(3, 1, 3, 1), offset=1, cex=1.2) +# plot(arrows(x0=0, x1=l.x, y0=0, y1=l.y, col="darkgreen", length=0.15, lwd=3)) +``` diff --git a/analysis/predicting_landslide_initiation.qmd b/analysis/predicting_landslide_initiation.qmd new file mode 100644 index 0000000..d45aaad --- /dev/null +++ b/analysis/predicting_landslide_initiation.qmd @@ -0,0 +1,534 @@ +--- +title: "Predicting landslide initiation" +author: Julia Lober +format: html +editor: visual +--- + +```{r load, include = FALSE} +# for spatial data +library(terra) + +# for modeling +library(mlr3) +library(mlr3spatial) +library(mlr3spatiotempcv) +library(mlr3pipelines) +library(mlr3learners) + +# for visualizations +library(ggplot2) +library(tidyterra) +library(viridis) + +# basics +library(data.table) +library(tictoc) +library(TerrainWorksUtils) +``` + +## Predicting landslide initiation + +We start by predicting on 1 small basin as part of 1 of the watershed designations of DOGAMI in Oregon. Find the data wherever it is located on your computer. + +```{r} +dir <- "//Mac/Home/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/" +# dir <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/" +dem <- terra::rast(paste0(dir, "Umpqua/basin_1.flt")) +pts <- terra::vect(paste0(dir, "all_initiation_points.shp")) + +pts_local <- crop(pts, dem) + +plot(dem) +plot(pts_local, + add = TRUE) +``` + +### Calculating the predictors + +The predictors chosen by our model are: + +1. Partial contributing area with a hydraulic conductivity of 1 and storm duration of 48 hours. + +2. Mean curvature + +3. Distance to road + +4. 
Stand age + +We need to calculate the partial contributing area and mean curvature using the executables included in this package. There is also a function for calculating the distance to a road, and the stand age can be collected from available data. + +The model is trained on all of the available inventory points and stored in "logreg_learner". This happens in the comparing_landslide_models.qmd file. + +```{r} +calculate <- TRUE + +scratch <- "//Mac/Home/Documents/terrainworks/code/sandbox/data/scratch/" +# scratch <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/scratch/" + +rasters <- c(paste0("GRADIENT,", scratch, "gradient"), + paste0("MEAN CURVATURE,", scratch, "mean_curv")) + +length_scale <- 15 + +if (calculate) { + rasters <- elev_deriv(rasters = rasters, + length_scale = length_scale, + dem = paste0(dir, "Umpqua/basin_1.flt"), + scratch_dir = scratch) +} +``` + +Now, calculate partial contributing area. The 48-hour duration was the best across all of the model investigating, so we only need to calculate that one. + +```{r} +if (calculate) { + rasters <- c(rasters, contributing_area(raster = paste0(scratch, "pca_k1_d48"), + dem = paste0(dir, "Umpqua/basin_1.flt"), + length_scale = 15, + k = 1, + d = 48, + scratch_dir = scratch)) +} + +``` + +Now that the scripts have been run, we can assemble the data from files (and never run the 2 hour script again). + +```{r} +scratch <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/scratch/" + +topo_files <- c(paste0("GRADIENT,", scratch, "gradient.flt"), + paste0("MEAN CURVATURE,", scratch, "mean_curv.flt")) + +topo_rast <- elev_deriv(rasters = topo_files) +pca_rast <- contributing_area(raster = paste0(scratch, "pca_k1_d48.flt")) + +rasters <- c(topo_rast, pca_rast) +``` + +### Success rate curves + +To evaluate the model using proportions, we use a success rate curve. The curve has the proportion of the total area on the x-axis, with the proportion of landslides on the y-axis. 
To generate this curve, we train a model on the sample points, then predict the entire DEM. The sampling to generate training points and model decisions have been made in a different document: we use a logistic regression learner trained on the mean curvature, log value of partial contributing area with a duration of 48 hours, and gradient. + +```{r} +folder <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/out/" +load(paste0(folder, "ls_1996.Rdata")) +load(paste0(folder, "ls_2007.Rdata")) +load(paste0(folder, "ls_2011.Rdata")) +load(paste0(folder, "nonls_1996.Rdata")) +load(paste0(folder, "nonls_2007.Rdata")) +load(paste0(folder, "nonls_2011.Rdata")) + +training_data <- rbind(ls_1996, + ls_2007, + ls_2011, + nonls_1996, + nonls_2007, + nonls_2011) + +train_subset <- subset(training_data, select = c("gradient", "mean_curv", "pca_k1_48", "class", "x", "y")) +train_subset$pca_k1_48 <- log(train_subset$pca_k1_48, base = 2) +train_subset$class <- as.factor(train_subset$class) +``` + +```{r} +set.seed(12345) +# train_subset <- subset(train_subset, select = c("mean_curv", "pca_k1_48", "class", "x", "y")) +landslide_task <- as_task_classif_st(x = train_subset, + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910") + +pl_feature_eng <- + po("scale") %>>% + po("encode") + +pl_feature_eng$train(landslide_task) +landslide_task_eng <- pl_feature_eng$predict(landslide_task)$encode.output + +logreg_learner <- lrn("classif.log_reg", + predict_type = "prob") + +logreg_learner$train(landslide_task_eng) +logreg_learner$model +``` + +As a brief sanity check, we will compare a version of the model that sub-samples the negative points to create an even balance between the two classes as input. I am curious to see how these compare in a final output model. 
+ +```{r} +source("../R/create_model_dataframe.R") +training_data <- training_data[!(is.na(training_data$dist_to_road)), 2:length(training_data)] + +landslide_task_subsamp <- as_task_classif_st(x = subset(training_data, select = c("gradient", "mean_curv", "pca_k1_48", "class", "x", "y")), + id = "landslide_initiation", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = "epsg:26910") + +pl_feature_eng$train(landslide_task_subsamp) +task_subsamp <- pl_feature_eng$predict(landslide_task_subsamp)$encode.output + +logreg_learner2 <- lrn("classif.log_reg", + predict_type = "prob") + +logreg_learner2$train(task_subsamp) +logreg_learner2$model +``` + +Now, we are ready to predict the basin area. We need to extract all the data points from the DEM and make sure the names match what is expected by the model (gradient, mean_curv, and pca_k1_48). + +```{r} +names(rasters) <- c("gradient", "mean_curv", "pca_k1_48") +predicting_data <- as.data.frame(rasters, xy = TRUE) +``` + +The mlr3 package supports spatial prediction with the predict_spatial function. This function accepts new data in raster format and outputs a raster, but I can't figure out if the function will support a probability output instead of a binary class output. + +```{r} +predictions <- predict_spatial(rasters, logreg_learner) +``` + +```{r} +plot(predictions) +``` + +Note that this classifies most of the area as positive while only the river areas and hilltops are marked as negative. This indicates that a low threshold was chosen for class assignment when we compare to the probability map produced below. + +For now, I'll try predicting on a data frame. The downside of this strategy is the extra time needed to convert the DEM to a data frame, and the lack of space optimization that the spatial_predict function allows. 
+ +```{r} +pd <- predicting_data +pd$pca_k1_48 <- log(predicting_data$pca_k1_48, base = 2) + +# manually pre process the data: take the log of the pca and center+scale the data +center_vals <- pl_feature_eng$pipeops$scale$state$center +scale_vals <- pl_feature_eng$pipeops$scale$state$scale + +pd$gradient <- (pd$gradient - center_vals[1]) / scale_vals[1] +pd$mean_curv <- (pd$mean_curv - center_vals[2]) / scale_vals[2] +pd$pca_k1_48 <- (pd$pca_k1_48 - center_vals[3]) / scale_vals[3] + +# predict the basin using scaled and centered data. +predictions_df <- logreg_learner$predict_newdata(pd) +``` + +Convert the predictions outputted to a data table, then add the columns with useful information like the predictors and (x,y) coordinates. + +```{r} +predictions_dt <- as.data.table(predictions_df) +predictions_dt <- cbind(predictions_dt, predicting_data) +names(predictions_dt) +``` + +We also convert it to a raster format for future use. + +```{r} +prob_rast <- (rast(subset(predictions_dt, select = c("x", "y", "prob.pos")))) +``` + +```{r} +ggplot(predictions_dt) + + geom_tile(aes(x = x, y = y, fill = prob.pos)) + + scale_fill_viridis(option = "A", direction = -1) + + geom_spatvector(data = pts_local) +``` + +### Success rate curve + +Our goal when looking at model performance will be to consider an analysis using proportions. + +To calculate the success rate curve, we first calculate the proportion of the area that falls up to a given probability value and the proportion of the total probability that falls up to a given probability value for each DEM cell. The steps for this calculation will be spelled out in the code below. 
+ +```{r} +proportions <- subset(predictions_dt, select = c("x", "y", "prob.pos")) +setorder(proportions, cols = "prob.pos") + +# calculate the proportion of total landslide probability for each DEM cell +proportions$prob.cumsum <- cumsum(proportions$prob.pos) +prob_sum <- max(proportions$prob.cumsum) +proportions$prob.prop <- proportions$prob.cumsum / prob_sum + +# calculate the proportion of total area for each DEM cell +proportions$area.cumsum <- seq(1, length(proportions$x)) +area_sum <- max(proportions$area.cumsum) +proportions$area.prop <- proportions$area.cumsum / area_sum + +setattr(proportions, "sorted", "prob.pos") +``` + +First, I calculated cumulative sums for each row. Now, we define some much larger interval for binning this data, which will allow much easier plotting. Between 0 and the maximum value for probability, we look at 500 values of the predicted probability. We extract the nearest row for each of these values from the proportions data table, and now have all of the information we need for plotting. + +```{r} +bins <- as.data.table(seq(0, max(proportions$prob.pos), length.out = 500)) +names(bins) <- c("probability") + +# match these probability bins with nearest rows in the proportions data table +binned_prop <- proportions[J(bins$probability), roll = "nearest"] +``` + +That one line of code should have created a new data frame with 500 rows and the nearest row from each of the cumulative sums above. Now we need the observed landslide data. 
+ +```{r} +# extract the probability values for the test points +pts_prob <- extract(prob_rast, pts_local, xy = TRUE) +pts_prob <- pts_prob[!is.na(pts_prob$prob.pos), ] +setorder(pts_prob, cols = "prob.pos") + +total_pts <- length(pts_prob$ID) + +# get the cumulative area (already calculated) and calculate the cumulative landslides +pts_cumul <- proportions[J(pts_prob$prob.pos), roll = "nearest"] +pts_cumul$ls.prop = seq(1, length(pts_cumul$x)) / total_pts + +# add the origin and a point at (1, 1) to make the graphs prettier +pts_cumul <- rbind(0, pts_cumul, fill = TRUE) +pts_cumul[1] <- 0 +pts_cumul <- rbind(pts_cumul, 0, fill = TRUE) +pts_cumul[length(pts_cumul$x)] <- 1 +``` + +The first plot that we look at shows the cumulative proportions of area, modeled landslides, and observed landslides at each probability mark. Ideally, we would want the modeled landslide curve to match up with the observed landslide curve. Since we are looking at a single basin right now, there is probably not enough information to really evaluate this correlation. + +```{r} +ggplot(cbind(bins, binned_prop)) + + geom_line(aes(x = probability, y = prob.prop), color = "red") + + geom_point(aes(x = probability, y = prob.prop), color = "red") + + geom_line(aes(x = probability, y = area.prop), color = "lightblue") + + geom_point(aes(x = probability, y = area.prop), color = "lightblue") + + geom_line(data = pts_cumul, aes(x = prob.pos, y = ls.prop)) + + geom_point(data = pts_cumul, aes(x = prob.pos, y = ls.prop)) +``` + +The main curve that we want to use to compare models is the success rate curve, or the proportion of the modeled landslides plotted against the proportion of area. Each point along the curve comes from one of the probability bins. One thing that this curve tells us is how specific the model is with the areas that it feels are landslide-prone. 
+ +```{r} +ggplot(cbind(bins, binned_prop)) + + geom_line(aes(x = area.prop, y = prob.prop), color = "red") + + geom_point(aes(x = area.prop, y = prob.prop), color = "red") + + geom_line(data = pts_cumul, aes(x = area.prop, y = ls.prop), color = "black") + + geom_point(data = pts_cumul, aes(x = area.prop, y = ls.prop), color = "black") +``` + +## Combining multiple basins + +Really, we want to be able to generate this curve over many different basins. We start this process by assuming that you have already generated all of the elevation derivative and partial contributing area files that are needed for the model to predict on the basin (and that you know where the files are). This part of the program will use that information to predict the probability of initiation for each basin, then combines these into one success rate curve. + +Since each basin will have millions of DEM cells, we have to sub-sample the cumulative values down to a number of data points that is reasonable to manage. We already do this for plotting; for combining multiple DEMs, we will just need to make sure to use consistent probability values for sub-sampling, and then the cumulative values will be averaged based on how many DEM cells the cumulative proportion was calculated from. TODO: figure out the exact formulas for calculating these weighted averages. + +Updated average = ( (Past average \* past \# cells) + new value ) / new cell total + +OR -- we just sum the cumulative sums and the total number of cells/cumulative probability and then take the averages at the end, once the totals can actually be calculated. This is probably the simpler option and I will go this route unless for some reason this ends up not working. + +Finding the observed landslide information will probably be easier - we will want to keep track of which points are located within each DEM, mark those points, and then the final step will be the same calculations as above. 
Since there is a maximum of about 400 points, we don't need to worry about data size as much. + +I think this will be one large for loop that iterates once per basin. I may break it out into a function, or maybe break out the inside of the for loop into a function? I'll start without the function and change it later if needed, though. + +```{r} +# replace these with the respective file paths to your DEMs +dem_dir <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/" +dir = "/Users/julialober/Documents/terrainworks/code/sandbox/data/predicting_input/" +# dir <- "//Mac/Home/Documents/terrainworks/code/sandbox/data/predicting_input/" +# dem_dir <- "//Mac/Home/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/" + +calculate <- FALSE + +basin_list <- list.files(dem_dir, paste0("^basin_[0-9]{1,4}\\.flt"), recursive = TRUE) +``` + +#### For calculating elevation derivatives: + +This code only needs to be run once. Eventually, I will remove it from this document. + +```{r} + +if (calculate) { + for (basin in basin_list) { + + # create directories for basins and sub-basins if needed. + # if (!dir.exists(paste0(dir, sub("/.*","", basin)))) { + dir.create(paste0(dir, sub("/.*","", basin))) + # } + # if (!dir.exists(paste0(dir, sub("\\..*","", basin)))) { + dir.create(paste0(dir, sub("\\..*","", basin))) + # } + + # define the elevation derivatives that we want to be calculated + rasters <- c(paste0("GRADIENT,", dir, sub("\\..*","", basin), "/gradient"), + paste0("MEAN CURVATURE,", dir, sub("\\..*","", basin), "/mean_curv")) + + print(rasters) + + length_scale <- 15 + + rasters <- elev_deriv(rasters = rasters, + length_scale = length_scale, + dem = paste0(dem_dir, basin), + scratch_dir = paste0(dir, sub("\\..*","", basin))) +} +} + +``` + +Short utility script for copying the partial contributing area files downloaded from sharefile into the folders structure that will be easy for me to use. 
I have the basins listed in a different order, so I have to inspect the input file for the correct number suffix that will correspond to the correct pca file. + +```{r} +dem_dir <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/downloaded_3.6/in/" +dir <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/predicting_input/" +pca_dir <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/pca/" + +# the pca files are given numbered in the order that they appear in this input file +input_order <- read.delim("/Users/julialober/Documents/input_list.txt") +names(input_order) <- c("list") + +for (basin in basin_list) { + target_dir <- paste0(dir, sub("\\..*","", basin), "/") + + # find the row number where the basin name matches + num <- which(grepl(sub("\\..*","", basin), input_order$list, ignore.case = TRUE)) + + target_files <- list.files(pca_dir, paste0("^pca_6_", num, "\\."), full.names = TRUE) + file.copy(target_files, target_dir) +} +``` + +A loop to figure out which basins are missing from the basin_list. These basins were included in the partial contributing area files that I got from Dan but for some reason are not included in the original input file for generating the training data that I downloaded. It looks like it is the McKenzie basin that is missing. + +```{r} +for (line in input_order$list) { + ans <- regexpr("/[A-Za-z]{5,30}/[A-Za-z]?basin_[0-9]{1,3}", line) + basin <- substr(line, ans[1] + 1, ans[1] + attr(ans, "match.length") - 1) + + # find the row number where the basin name matches + found <- which(grepl(basin, basin_list, ignore.case = TRUE), arr.ind = FALSE) + if (length(found) == 0) { + print(basin) + } + +} +``` + +A note: this code assumes we already have a trained model to use for predictions. The learner we are using here comes from the code above. I am timing each iteration of the loop to get an idea of how long it will take to run. So far, we are hovering around 50 seconds for one basin. 
+ +TODO: figure out how to crop the points and add the observed landslides to a data frame to compare. This was relatively simple above, so shouldn't be too hard to figure out. + +```{r} +dir <- "/Users/julialober/Documents/terrainworks/code/sandbox/data/predicting_input/" + +model <- logreg_learner +bins <- as.data.table(seq(0, max(proportions$prob.pos), length.out = 500)) +names(bins) <- c("probability") + +count <- 1 + +cumul_prop <- data.frame("area" = rep(0, 500), + "prob" = rep(0, 500)) + +pts_prop <- data.frame("x" = 0, + "y" = 0, + "prob.pos" = 0, + "prob.cumsum" = 0, + "area.cumsum" = 0, + "ls.prop" = 0) + +for (basin in basin_list) { + + basin <- sub("\\..*","", basin) + + tic(msg = paste0("loop for ", basin)) + + # find and load the data files + topo_files <- c(paste0("GRADIENT,", dir, basin, "/gradient.flt"), + paste0("MEAN CURVATURE,", dir, basin, "/mean_curv.flt")) + topo_rast <- elev_deriv(rasters = topo_files) + print(basin) + # print(count) + num <- which(grepl(sub("\\..*","", basin), input_order$list, ignore.case = TRUE)) + pca_rast <- contributing_area(raster = paste0(dir, basin, "/pca_6_", num, ".flt")) + count <- count + 1 + rasters <- c(topo_rast, pca_rast) + + names(rasters) <- c("gradient", "mean_curv", "pca_k1_48") + predicting_data <- as.data.frame(rasters, xy = TRUE) + + pd <- predicting_data + pd$pca_k1_48 <- log(predicting_data$pca_k1_48, base = 2) + + # manually pre process the data: take the log of the pca and center+scale the data + center_vals <- pl_feature_eng$pipeops$scale$state$center + scale_vals <- pl_feature_eng$pipeops$scale$state$scale + + pd$gradient <- (pd$gradient - center_vals[1]) / scale_vals[1] + pd$mean_curv <- (pd$mean_curv - center_vals[2]) / scale_vals[2] + pd$pca_k1_48 <- (pd$pca_k1_48 - center_vals[3]) / scale_vals[3] + + # predict the basin using scaled and centered data. 
+ predictions_df <- as.data.table(logreg_learner$predict_newdata(pd)) + predictions_df <- cbind(predictions_df, predicting_data) + + # head(predictions_df) + + proportions <- as.data.table(subset(predictions_df, select = c("x", "y", "prob.pos"))) + setorder(proportions, cols = "prob.pos") + + # calculate the cumulative sums of area and probabilities + proportions$prob.cumsum <- cumsum(proportions$prob.pos) + prob_sum <- max(proportions$prob.cumsum) + proportions$area.cumsum <- seq(1, length(proportions$x)) + area_sum <- max(proportions$area.cumsum) + + setattr(proportions, "sorted", "prob.pos") + + binned_prop <- proportions[J(bins$probability), roll = "nearest"] + cumul_prop$area <- cumul_prop$area + binned_prop$area.cumsum + cumul_prop$prob <- cumul_prop$prob + binned_prop$prob.cumsum + + pts_local <- crop(pts, rasters) + # extract the probability values for the test points + pts_prob <- extract(prob_rast, pts_local, xy = TRUE) + pts_prob <- pts_prob[!is.na(pts_prob$prob.pos), ] + setorder(pts_prob, "prob.pos") + + total_pts <- length((pts_prob$prob.pos)) + + # get the cumulative area (already calculated) and calculate the cumulative landslides + pts_cumul <- proportions[J(pts_prob$prob.pos), roll = "nearest"] + pts_cumul$ls.prop = seq(1, length(pts_cumul$x)) / total_pts + + pts_prop <- rbind(pts_prop, pts_cumul) + + toc() +} + + +total_area <- max(cumul_prop$area) +total_prob <- max(cumul_prop$prob) + +cumul_prop$area_prop <- cumul_prop$area / total_area +cumul_prop$prob_prop <- cumul_prop$prob / total_prob + +dt <- cbind(bins, cumul_prop)[J(pts_prop$prob.pos), roll = "nearest", on = "probability"] +pts_prop$area_prop <- dt$area_prop + +``` + +With basins combined, we see a slightly different shape in the curves. The proportions of area and probability jump up greatly at smaller probabilities. 
+ +```{r} +ggplot(cbind(bins, cumul_prop)) + + geom_line(aes(x = probability, y = prob_prop), color = "red") + + geom_point(aes(x = probability, y = prob_prop), color = "red") + + geom_line(aes(x = probability, y = area_prop), color = "lightblue") + + geom_point(aes(x = probability, y = area_prop), color = "lightblue") + + geom_line(data = pts_prop, aes(x = prob.pos, y = ls.prop)) + + geom_point(data = pts_prop, aes(x = prob.pos, y = ls.prop)) +``` + +```{r} +ggplot(cbind(bins, cumul_prop)) + + geom_line(aes(x = area_prop, y = prob_prop), color = "red") + + geom_point(aes(x = area_prop, y = prob_prop), color = "red") +``` diff --git a/analysis/resampling_methods.qmd b/analysis/resampling_methods.qmd new file mode 100644 index 0000000..ed1ceb4 --- /dev/null +++ b/analysis/resampling_methods.qmd @@ -0,0 +1,230 @@ +--- +title: "Spatial CV vs traditional CV" +author: "Julia Lober" +format: html +editor: visual +params: + data_dir: + label: Data Directory + value: /Users/julialober/Documents/terrainworks/code/sandbox/data/ + input: text + training_data: + label: Training data .Rdata file + value: sample_wilson.Rdata + input: text + points: + label: Training data points + value: DeanCr/DeanCr_Initiation_Points.shp + input: text + dem: + label: Training data dem + value: DeanCr/elev_deancr.flt + input: text +--- + +```{r} +#| echo: false +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + error = FALSE +) +``` + +```{r} +#| echo: false +# load(paste0(params$data_dir, params$training_data)) +# pnts <- terra::vect(paste0(params$data_dir, params$points)) +# dem <- terra::rast(paste0(params$data_dir, params$dem)) +source("../R/create_model_dataframe.R") +sample_pnts <- training_data +``` + +```{r load, include = FALSE} +library(mlr3) +library(mlr3spatial) +library(mlr3spatiotempcv) +library(mlr3learners) +library(mlr3tuning) +library(mlr3viz) +# Packages for models - change these based on what type of model you build +library(glmnet) +library(ranger) +``` + +## 
Evaluate Spatial CV Methods vs Normal CV + +Data is loaded. First we need to look at the points on a map. + +TODO: figure out how to add a map background to the + +```{r} +# orig_p <- par(mar = c(5, 5, 0, 5)) +points <- as.data.frame(cbind(sample_pnts[, "x"], y = sample_pnts[, "y"])) +points$yr <- sample_pnts$year +points$class <- sample_pnts$class +terra::crs(points) <- "epsg:26910" +points <- project(points, "epsg:4326") + +ggplot() + + geom_polygon(data = oregon, mapping = aes(x = long, y = lat), + color = "white", fill = "white") + + geom_spatvector(data = points[points$class == "pos", ], + mapping = aes(shape = as.factor(points[points$class == "pos", ]$yr)), + show.legend = TRUE, size = 1, color = "red", alpha = 0.5) + + geom_spatvector(data = points[points$class == "neg", ], + mapping = aes(shape = as.factor(points[points$class == "neg", ]$yr)), + show.legend = TRUE, size = 1 , color = "black", alpha = 0.5) + + scale_shape_manual(values = c(3, 1, 17)) + + labs(x = "Longitude", y = "Latitude", shape = "Study points") + +# par(orig_p) + +``` + +We want to train a logistic regression learner based on the landslide data points loaded into the data frame sample_pnts. + +```{r} +task <- as_task_classif_st(x = sample_pnts[!(is.na(sample_pnts$dist_to_road)), 2:14], + id = "landslide_init", + target = "class", + positive = "pos", + coordinate_names = c("x", "y"), + crs = 2992) + +learner <- lrn("classif.log_reg", + predict_type = "prob") +``` + +```{r} +#| echo: false +head(sample_pnts) +``` + +### Non-spatial cross validation + +Non-spatial cross validation randomly selects points to split the training data into training and testing sets. The concern we have with this method is that in some cases, it will end up overestimating the performance of the model or underestimating error. We want to approximate model performance as closely as possible in order to assess how much confidence we can put in it. 
+ +```{r} +set.seed(12345) +rsmp_nonsp <- rsmp("cv", + folds = 5) + +autoplot(rsmp_nonsp, task, fold_id = c(1:5)) +``` + +### K-means clustering spatial cross-validation + +We can start running a cluster-based spatial CV method. This method does not consider the temporal distribution of the points, but uses a k-means clustering algorithm to split the data into the defined number of regions based on their x-y coordinates. This is one of the more straight-forward methods of spatial cross-validation: divide our data into groupings that are all near each other. + +> Nevertheless, despite the random selection of initial cluster centers, repeated partitionings may in some cases be nearly identical. Also, k-means clustering may be less suitable for data sets with pre-existing clusters of points and/or with isolated, distant sample locations. When distinct clusters of points are present, as in multi-level sampling, it may be better to define clusters using a factor variable. ([mlr3spatiotempcv reference](https://arxiv.org/pdf/2110.12674.pdf)) + +```{r} +set.seed(12345) +rsmp_clus <- rsmp("spcv_coords", + folds = 5) + +# rr_sp = resample(task = task, +# learner = learner, +# resampling = resampling_sp) + +autoplot(rsmp_clus, task, fold_id = c(1:5)) + # ggplot2::scale_x_continuous(breaks = seq(124.50, 124.62, 0.04)) * + # ggplot2::scale_y_continuous(breaks = seq(55.38, 55.48, 0.03)) +``` + +### Block spatial cross-validation + +This is the block method. Equal-sized blocks are created and then grouped to create the number of splits. The more columns/rows are created, the closer this method would get to a non-spatial cross validation method, so this is an important factor for this method. + +This method is probably not as well suited to non-rectangular study areas. 
+ +```{r} +set.seed(12345) +rsmp_block <- rsmp("spcv_block", + folds = 5, + cols = 4, + rows = 4) + +autoplot(rsmp_block, task, fold_id = c(1:5), size = 1, show_blocks = TRUE) +``` + +### Temporal cross-validation + +Another concept for resampling is to design a strategy that is tailored to our specific dataset. We could do this to varying level of degrees, based on the predictors that are included in our data. + +One option is to simply divide the dataset into the three natural temporal divisions that we observe: (roughly) 1996, 2007, and 2011. First, we define the year column as a factor variable, then pass it to custom_cv as the split determination variable. However, this method produces drastically uneven folds - we have 213 observations from 1996, 137 from 2007, and only 47 from 2011. + +```{r} +set.seed(12345) +yr_splits <- as.factor(sample_pnts[!(is.na(sample_pnts$dist_to_road)),]$year) + +rsmp_temp <- rsmp("custom_cv") +rsmp_temp$instantiate(task, f = yr_splits) + +autoplot(rsmp_temp, task, fold_id = c(1:3)) +``` + +Another proposal was to divide by both time and lithology. For example: + +> Train on 1996 sedimentary + volcaniclastic,  +> +> test on 1996 other rock types, 2007 sedimentary+volcaniclastic, 2007 other rock types, 2011 sed+volcaniclastic, 2011 other rock types. + +This method is based in our physical understanding of how landslides occur, and will help to decide how much we can extrapolate a model that is built using a certain subset of landslides to other lithologies. 
+ +```{r} +set.seed(12345) +yr_lith <- ((sample_pnts[!(is.na(sample_pnts$dist_to_road)),]$year) * sample_pnts[!(is.na(sample_pnts$dist_to_road)),]$geo) +yr_lith_splits <- as.factor(yr_lith) + +rsmp_templith <- rsmp("custom_cv") +rsmp_templith$instantiate(task, f = yr_lith_splits) + +autoplot(rsmp_templith, task, fold_id = c(1:11)) +``` + +I am going to ignore the leave-one-out methods for now since we don't have many data points, and so leaving any of them out would likely have a big impact on the results that we observe (I would expect it to report lower performance than we actually observe). + +Other options we have are ("method ="): + +1. "sptcv_cstf" - leave-location-out, leave-time-out, or leave-location-time-out methods. Control which method is being used by parameters "time =" and "space =". I believe this is the only way to do temporal cross-validation (aside from "custom_cv"). + +## Evaluating the performance of each method. + +Now we need to get the performance metric calculated using each resampling method, so that we can compare them. This step is likely to take a bit of time. 
+ 
+```{r} 
+methods = c(rsmp_block, rsmp_clus, rsmp_nonsp, rsmp_temp, rsmp_templith) 
+times = list() 
+model_perf = list() 
+ 
+for (method in methods) { 
+  build_time <- system.time(model <- resample(task= task, 
+                                              learner = learner, 
+                                              resampling = method)) 
+  times <- append(times, build_time["elapsed"]) 
+  model_perf <- append(model_perf, model$aggregate(measures = msr("classif.auc"))) 
+} 
+ 
+``` 
+ 
+```{r} 
+method_names = c("rsmp_block", "rsmp_clus", "rsmp_nonsp", "rsmp_temp", "rsmp_templith") 
+ 
+to_plot <- c(model_perf[1]$classif.auc, 
+             model_perf[2]$classif.auc, 
+             model_perf[3]$classif.auc, 
+             model_perf[4]$classif.auc, 
+             model_perf[5]$classif.auc) 
+to_plot <- as.data.frame(to_plot) 
+to_plot$names <- method_names[1:5] 
+ 
+ggplot(to_plot, aes(x = names, y = to_plot)) + 
+  geom_bar(stat = "identity") + 
+  coord_cartesian(ylim=c(0.8,1)) + 
+  labs(y = "Area under the ROC curve", x = "Resampling method") 
+  # scale_y_continuous(limits = c(0.5, 1)) 
+``` 
+ 
+Based on the literature, let's go with the clustering resampling method. 
diff --git a/inst/examples/dev_15.tif b/inst/examples/dev_15.tif new file mode 100644 index 0000000..9f96b8b Binary files /dev/null and b/inst/examples/dev_15.tif differ diff --git a/inst/examples/elev_scottsburg.flt b/inst/examples/elev_scottsburg.flt new file mode 100644 index 0000000..fd2bf50 Binary files /dev/null and b/inst/examples/elev_scottsburg.flt differ diff --git a/inst/examples/elev_scottsburg.flt.aux.xml b/inst/examples/elev_scottsburg.flt.aux.xml new file mode 100644 index 0000000..71412f7 --- /dev/null +++ b/inst/examples/elev_scottsburg.flt.aux.xml @@ -0,0 +1,6 @@ + + + elev_scottsburg + nan + + diff --git a/inst/examples/elev_scottsburg.hdr b/inst/examples/elev_scottsburg.hdr new file mode 100644 index 0000000..0de93fa --- /dev/null +++ b/inst/examples/elev_scottsburg.hdr @@ -0,0 +1,7 @@ +ncols 302 +nrows 327 +xllcorner 428592.38410541 +yllcorner 4827754.0301814 +cellsize 20 +NODATA_value nan +BYTEORDER LSBFIRST diff --git a/inst/examples/elev_scottsburg.prj b/inst/examples/elev_scottsburg.prj new file mode 100644 index 0000000..d306e34 --- /dev/null +++ b/inst/examples/elev_scottsburg.prj @@ -0,0 +1 @@ +PROJCS["NAD_1983_UTM_Zone_10N",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",500000.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-123.0],PARAMETER["Scale_Factor",0.9996],PARAMETER["Latitude_Of_Origin",0.0],UNIT["Meter",1.0]] diff --git a/inst/examples/elev_scottsburg.stx b/inst/examples/elev_scottsburg.stx new file mode 100644 index 0000000..b3513cf --- /dev/null +++ b/inst/examples/elev_scottsburg.stx @@ -0,0 +1 @@ +1 2.1317412853 542.2497558594 -9999.0000000000 -9999.0000000000 diff --git a/inst/examples/elev_scottsburg.tif b/inst/examples/elev_scottsburg.tif new file mode 100644 index 0000000..a630bac Binary files /dev/null and 
b/inst/examples/elev_scottsburg.tif differ diff --git a/inst/examples/grad_15.tif b/inst/examples/grad_15.tif new file mode 100644 index 0000000..bd78151 Binary files /dev/null and b/inst/examples/grad_15.tif differ diff --git a/inst/examples/initiation_points.cpg b/inst/examples/initiation_points.cpg new file mode 100644 index 0000000..3ad133c --- /dev/null +++ b/inst/examples/initiation_points.cpg @@ -0,0 +1 @@ +UTF-8 \ No newline at end of file diff --git a/inst/examples/initiation_points.dbf b/inst/examples/initiation_points.dbf new file mode 100644 index 0000000..313c4e3 Binary files /dev/null and b/inst/examples/initiation_points.dbf differ diff --git a/inst/examples/initiation_points.prj b/inst/examples/initiation_points.prj new file mode 100644 index 0000000..3ed61eb --- /dev/null +++ b/inst/examples/initiation_points.prj @@ -0,0 +1 @@ +PROJCS["NAD_1983_UTM_Zone_10N",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",500000.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-123.0],PARAMETER["Scale_Factor",0.9996],PARAMETER["Latitude_Of_Origin",0.0],UNIT["Meter",1.0]] \ No newline at end of file diff --git a/inst/examples/initiation_points.shp b/inst/examples/initiation_points.shp new file mode 100644 index 0000000..1827d95 Binary files /dev/null and b/inst/examples/initiation_points.shp differ diff --git a/inst/examples/initiation_points.shx b/inst/examples/initiation_points.shx new file mode 100644 index 0000000..aa4739a Binary files /dev/null and b/inst/examples/initiation_points.shx differ diff --git a/inst/examples/input_makegrids.txt b/inst/examples/input_makegrids.txt new file mode 100644 index 0000000..1df08a8 --- /dev/null +++ b/inst/examples/input_makegrids.txt @@ -0,0 +1,12 @@ +# Input file for makeGrids + + DEM: 
c:\work\Sandbox\git\TerrainWorksUtils\inst\examples\elev_scottsburg + SCRATCH DIRECTORY: C:\Work\scratch\ + LENGTH SCALE: 4.0 + GRID: GRADIENT, OUTPUT FILE = c:\temp\grad4.flt + GRID: PLAN CURVATURE, OUTPUT FILE = c:\temp\plan4.flt +# GRID: PROFILE CURVATURE, OUTPUT FILE = d:\data\AK\BirchCreek\prof20 +# GRID: NORMAL SLOPE CURVATURE, OUTPUT FILE = d:\data\AK\BirchCreek\profile20 +# GRID: TANGENTIAL CURVATURE, OUTPUT FILE = d:\data\AK\BirchCreek\tang20 +# GRID: GEP, OUTPUT FILE = c:\work\data\spain\base_layers\GEP10, MAX VALUE = 2.13, MIN SLOPE = 0.4 +# GRID: BCONTOUR, OUTPUT FILE = d:\data\AK\BirchCreek\bcon10 diff --git a/inst/examples/makegrids_input_sample.txt b/inst/examples/makegrids_input_sample.txt new file mode 100644 index 0000000..40d2a48 --- /dev/null +++ b/inst/examples/makegrids_input_sample.txt @@ -0,0 +1,11 @@ +# Input file for makeGrids +# Creating by input_file_utils.R +# On 2022-10-19 10:57:24 +DEM: \\Mac\Home\Documents\terrainworks\code\TerrainWorksUtils\inst\examples\elev_scottsburg +SCRATCH DIRECTORY: \\Mac\Home\Documents\terrainworks\code\TerrainWorksUtils\inst\scratch +LENGTH SCALE: 15 +GRID: GRADIENT, OUTPUT FILE = \\Mac/Home/Documents/terrainworks/code/TerrainWorksUtils/inst/scratch/grad.flt +GRID: PLAN CURVATURE, OUTPUT FILE = \\Mac/Home/Documents/terrainworks/code/TerrainWorksUtils/inst/scratch/plan.flt +GRID: PROFILE CURVATURE, OUTPUT FILE = \\Mac/Home/Documents/terrainworks/code/TerrainWorksUtils/inst/scratch/prof.flt +GRID: NORMAL SLOPE CURVATURE, OUTPUT FILE = \\Mac/Home/Documents/terrainworks/code/TerrainWorksUtils/inst/scratch/norm.flt +GRID: TANGENTIAL CURVATURE, OUTPUT FILE = \\Mac/Home/Documents/terrainworks/code/TerrainWorksUtils/inst/scratch/tan.flt diff --git a/inst/examples/partial_input_sample.txt b/inst/examples/partial_input_sample.txt new file mode 100644 index 0000000..38804ea --- /dev/null +++ b/inst/examples/partial_input_sample.txt @@ -0,0 +1,9 @@ +# Input file for makeGrids +# Creating by input_file_utils.R +# On 
2022-10-19 15:05:51 +DEM: \\Mac\Home\Documents\terrainworks\code\TerrainWorksUtils\inst\examples\elev_scottsburg +SCRATCH DIRECTORY: \\Mac\Home\Documents\terrainworks\code\TerrainWorksUtils\inst\scratch +LENGTH SCALE: 15 +DURATION: 5 +CONDUCTIVITY: 1 +OUTPUT RASTER: \\Mac\Home\Documents\terrainworks\code\TerrainWorksUtils\inst\scratch\pca.flt diff --git a/inst/examples/pca_15m_48hr.tif b/inst/examples/pca_15m_48hr.tif new file mode 100644 index 0000000..0374476 Binary files /dev/null and b/inst/examples/pca_15m_48hr.tif differ diff --git a/inst/examples/plan_15.tif b/inst/examples/plan_15.tif new file mode 100644 index 0000000..9002ced Binary files /dev/null and b/inst/examples/plan_15.tif differ diff --git a/inst/examples/prof_15.tif b/inst/examples/prof_15.tif new file mode 100644 index 0000000..fabc820 Binary files /dev/null and b/inst/examples/prof_15.tif differ diff --git a/inst/examples/scottsburg.R b/inst/examples/scottsburg.R new file mode 100644 index 0000000..c5499ac --- /dev/null +++ b/inst/examples/scottsburg.R @@ -0,0 +1,34 @@ +library(terra) + +elev <- rast("E:/NetmapData/Scottsburg/elev_scottsburg.flt") +elev_agg <- aggregate(elev, fact = 10) +writeRaster(elev_agg, filename = "data-cache/elev_scottsburg.tif") +rm(elev) +rm(elev_agg) + +grad <- rast("E:/NetmapData/Scottsburg/grad_15.tif") +grad_agg <- aggregate(grad, fact = 10) +writeRaster(grad_agg, "data-cache/grad_15.tif") +rm(grad) + +dev <- rast("E:/Netmapdata/Scottsburg/dev_15.tif") +dev_agg <- aggregate(dev, fact = 10) +writeRaster(dev_agg, "data-cache/dev_15.tif") +rm(dev) + +prof <- rast("E:/NetmapData/Scottsburg/prof_15.tif") +prof_agg <- aggregate(prof, fact = 10) +writeRaster(prof_agg, "data-cache/prof_15.tif") +rm(prof) + +plan <- rast("E:/NetmapData/Scottsburg/plan_15.tif") +plan_agg <- aggregate(plan, fact = 10) +writeRaster(plan_agg, "data-cache/plan_15.tif") +rm(plan) + +pca_15m_48hr <- rast("E:/NetmapData/Scottsburg/pca_15m_48hr.flt") +pca_agg <- aggregate(pca_15m_48hr, fact = 10) 
+writeRaster(pca_agg, "data-cache/pca_15m_48hr.tif") + +points <- vect("E:/NetmapData/Scottsburg/Scottsburg_Upslope.shp") +writeVector(points, "data-cache/initiation_points.shp") diff --git a/inst/extdata/oregon_map.Rdata b/inst/extdata/oregon_map.Rdata new file mode 100644 index 0000000..3649c46 Binary files /dev/null and b/inst/extdata/oregon_map.Rdata differ