Improving performance of iteration in R

  Kiến thức lập trình

I originally created a for loop to calculate numerous variables that were sometimes dependent on previous iterations e.g. [i] and [i-1].

To improve the performance for larger datasets I attempted to use vectorisation and produced the following code, which is quicker. The dataframe I hope to use this code on has a length of ~200,000 observations. Is there anyway I can improve the performance of this code further? I hope to run different scenarios in the future, which will also increase computing time.

Thanks for your help!

# Load required packages
library(dplyr)
library(purrr)

# Set seed for reproducibility
set.seed(42)

# Generate random data for storm
n <- 10000  # number of rows
storm <- data.frame(
  date_time = seq.POSIXt(from = as.POSIXct("2021-01-01"), by = "day", length.out = n),
  temp = runif(n, min = -10, max = 35),
  Rain = runif(n, min = 0, max = 50),
  PET = runif(n, min = 0, max = 10),
  Qin_m3 = runif(n, min = 0, max = 100)
)

# Placeholder functions
TSA_depth.fn <- function(volume, TSA_dimensions) {
  # Example behavior: return volume divided by some constant
  return(volume / TSA_dimensions)
}

soil_infiltration.fn <- function(volume, MRC_df, MRC_name, pipe_base_vol) {
  # Example behavior: return volume times some fraction
  return(volume * 0.1)
}

outlet_pipe.fn <- function(diameter, waterDepth, pipe_base, Cd, maxTSA_height) {
  # Example behavior: return some function of the inputs
  return(diameter * waterDepth * Cd)
}

# Constants
maxTSA_area <- 1000
TSA_dimensions <- 50
pipe_diameter <- 0.5
pipe_base_m <- 0.1
Cd <- 0.6
maxTSA_height <- 2.5
maxTSA_volume <- 2000
MRC_df <- data.frame()  # Example placeholder
MRC_name <- "example"
pipe_base_vol <- 0.1

# Initial data preparation
TSA_model <- storm %>%
  select(date_time, temp, Rain, PET, Qin_m3) %>%
  mutate(Qin_m3 = replace(Qin_m3, 1, 0))

# Initialize new columns with 0 and precompute PET_m3
TSA_model <- TSA_model %>%
  mutate(S = 0, dS = 0, depth = 0, PET_m3 = round((PET / 1000) * maxTSA_area, digits = 3), 
         soil_m3 = 0, pipe_m3 = 0, overflow_m3 = 0, Qout = 0)

# Function to calculate the values for the rows
calculate_values <- function(df) {
  n <- nrow(df)
  for (i in 2:n) {
    df$dS[i] <- df$Qin_m3[i] - df$Qout[i - 1]
    df$S[i] <- df$S[i - 1] + df$dS[i]
    df$depth[i] <- TSA_depth.fn(volume = df$S[i], TSA_dimensions = TSA_dimensions)
    
    soil_infiltration <- soil_infiltration.fn(volume = df$S[i], MRC_df = MRC_df, MRC_name = MRC_name, pipe_base_vol = pipe_base_vol)
    df$soil_m3[i] <- max(soil_infiltration - df$PET_m3[i], 0)
    df$soil_m3[i] <- ifelse(df$S[i] - df$soil_m3[i] < 0, df$S[i], df$soil_m3[i])
    
    pipe_outflow <- outlet_pipe.fn(diameter = pipe_diameter, waterDepth = df$depth[i], pipe_base = pipe_base_m, Cd = Cd, maxTSA_height = maxTSA_height)
    df$pipe_m3[i] <- max(pipe_outflow, 0)
    
    df$overflow_m3[i] <- ifelse(df$S[i] > maxTSA_volume,
                                max(df$S[i] - maxTSA_volume - df$pipe_m3[i] - df$soil_m3[i] - df$PET_m3[i], 0),
                                0)
    
    df$Qout[i] <- df$PET_m3[i] + df$soil_m3[i] + df$pipe_m3[i] + df$overflow_m3[i]
    df$Qout[i] <- ifelse(df$S[i] - df$Qout[i] < 0, df$S[i], df$Qout[i])
  }
  return(df)
}

# Calculate the new values
TSA_model <- calculate_values(TSA_model)

# Display the updated TSA_model
str(TSA_model)
summary(TSA_model)

LEAVE A COMMENT