Homework

Home page

library(dplyr)
library(tidyr)

Question 1

##################################################
# function: file_builder
# create a set of random files for regression
# input: file_n = number of files to create
#       : file_folder = name of folder for random files
#       : file_size = c(min,max) number of rows in file
#       : file_na = number on average of NA values per column
# output: set of random files
#------------------------------------------------- 
file_builder <- function(file_n=10,
                        file_folder="Data/RandomFiles/",
                        file_size=c(15,100),
                        file_na=3){
for (i in seq_len(file_n)) {
file_length <- sample(file_size[1]:file_size[2],size=1) # get number of rows
var_x <- runif(file_length,max=0.75) # create random x with predetermined max
var_y <- runif(file_length,min=0.25) # create random y with predetermined min
df <- data.frame(var_x,var_y) # bind into a data frame
bad_vals <- rpois(n=1,lambda=file_na) # determine NA number
df[sample(nrow(df),size=bad_vals),1] <- NA # random NA in var_x
df[sample(nrow(df),size=bad_vals),2] <- NA # random NA in var_y

# create label for file name with padded zeroes
file_label <- paste(file_folder,
                       "ranFile",
                       formatC(i,
                       width=3,
                       format="d",
                       flag="0"),
                       ".csv",sep="")

# set up data file and incorporate time stamp and minimal metadata
write.table(cat("# Simulated random data file for batch processing","\n",
                    "# timestamp: ",as.character(Sys.time()),"\n",
                    "# NBB","\n",
                    "# ------------------------", "\n",
                    "\n",
                    file=file_label,
                    row.names="",
                    col.names="",
                    sep=""))

# now add the data frame
write.table(x=df,
            file=file_label,
            sep=",",
            row.names=FALSE,
            append=TRUE)


}
}

##################################################
# function: t_stats
# performs t test, returns p value
# input: 2-column data frame (x and y)
# output: p-value
#------------------------------------------------- 
t_stats <- function(d=NULL) {
             if(is.null(d)) {
               x_var <- runif(10,max=0.75)
               y_var <- runif(10,min=0.25)
               d <- data.frame(x_var,y_var)
             }
  t <- t.test(x=d[,1],y=d[,2],alternative='less')
  return(t$p.value)

}

#--------------------------------------------
# Global variables
file_folder <- "Data/RandomFiles/"
n_files <- 100
file_out <- "Data/StatsSummary.csv"
#--------------------------------------------

# Create 100 random data sets
dir.create(file_folder)
file_builder(file_n=n_files)
file_names <- list.files(path=file_folder)

# Create data frame to hold file summary statistics
ID <- seq_along(file_names)
file_name <- file_names
slope <- rep(NA,n_files)
p_val <- rep(NA,n_files)
r2 <- rep(NA,n_files)

stats_out <- data.frame(ID,file_name,p_val)

# batch process by looping through individual files
for (i in seq_along(file_names)) {
  data <- read.table(file=paste(file_folder,file_names[i],sep=""),
                     sep=",",
                     header=TRUE) # read in next data file
  
  d_clean <- data[complete.cases(data),] # get clean cases
  
  . <- t_stats(d_clean) # pull regression stats from clean file
  stats_out[i,3] <- . # copy into last column
  
}
# set up output file and incorporate time stamp and minimal metadata
  write.table(cat("# Summary stats for batch processing of t-tests","\n",
                    "# timestamp: ",as.character(Sys.time()),"\n",
                    "# NBB","\n",
                    "# ------------------------", "\n",
                    "\n",
                    file=file_out,
                    row.names="",
                    col.names="",
                    sep=""))
  
# now add the data frame
  write.table(x=stats_out,
              file=file_out,
              row.names=FALSE,
              col.names=TRUE,
              sep=",",
              append=TRUE)

df <- read.table("Data/StatsSummary.csv",sep=',',header = TRUE)
head(df)

##   ID      file_name        p_val
## 1  1 ranFile001.csv 2.544941e-12
## 2  2 ranFile002.csv 9.040247e-03
## 3  3 ranFile003.csv 3.310240e-17
## 4  4 ranFile004.csv 1.487455e-03
## 5  5 ranFile005.csv 7.475342e-15
## 6  6 ranFile006.csv 1.835452e-09

Question 2

(Continuation of Homework 10)

# Initializing a vector with some random number of zeroes
vect=sample(c(-10:10),size=100,replace=T)

# This line of code returns the number of zeroes in that vector
length(subset(vect,vect==0))

## [1] 3

Question 3

##################################################
# FUNCTION: product_matrix
# creates a matrix where each element is the product of its respective row # and col #
# input: nrows, ncols -> number of rows and cols (ints)
# output: mat -> matrix of size nrows, ncols
#------------------------------------------------- 
product_matrix <- function(nrows=runif(1,min=1,max=10),ncols=runif(1,min=1,max=10)){
  mat=matrix(data=NA, nrow=nrows,ncol=ncols)
  if(nrows<=0 | ncols<=0){
    return(message("Error: nrows and ncols must both be greater than zero"))
  }
  for(i in 1:nrow(mat)){
    for(j in 1:ncol(mat)){
      mat[i,j] <- i*j
      }
  }
 return(mat)
}
product_matrix()

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    2    4    6
## [3,]    3    6    9
## [4,]    4    8   12
## [5,]    5   10   15
## [6,]    6   12   18

Question 4

Function to read in (or generate) the data

# can't forget to set seed
set.seed(42)

####### ###########################################
# function: read_data
# read in (or generate) data set for analysis
# input: file name (or nothing, for this demo)
# output: 3 column data frame of observed data (x,y)
#------------------------------------------------- 
read_data <- function(z=NULL) {
  if(is.null(z)){
    # random group assignment
    DO_CHEMISTRY <- sample(c('YES','NO'),20,T)
    # minimum MW is 15 (methane), max chosen arbitrarily
    MW <- sample(15:1000,20,F)
    z <- data.frame(INDEX=1:20,DO_CHEMISTRY,MW)} 
  else {
    z <-read.table(file=z,header=TRUE,sep=",") %>% 
      select(CPD_ID,DO_CHEMISTRY, MW)}
  return(z)
}
df <- read_data("Data/NIHMS50746-supplement-AZ_chemsurvey_QED.csv")

Function to calculate the means for each treatment group

##################################################
# function: get_metric
# perform t-test for means of the two treatment groups for randomization test
# input: 3-column data frame for t-test
# output: p-value from t-test
#------------------------------------------------- 
get_means <- function(z=NULL) {
  if(is.null(z)){
    # random group assignment
    DO_CHEMISTRY <- sample(c('YES','NO'),20,T)
    # minimum MW is 15 (methane), max chosen arbitrarily
    MW <- sample(15:1000,20,F)
    z <- data.frame(INDEX=1:20,DO_CHEMISTRY,MW)} # set up data frame                 
    group_means <- z %>% group_by(z$DO_CHEMISTRY) %>% 
      summarise(mean=mean(z$MW))
  # Row names bug needs to be addressed
  t <- data.frame(t(group_means))[-1,]
  names(t) <- c("NO","YES")
  return(t)
}
get_means(df)

##            NO      YES
## mean 364.9435 364.9435

Function to perform t-test and return p-value

##################################################
# function: get_pval
# calculate p value from simulation
# input: dataframe of randomized (or real) data
# output: t-test p-value
#------------------------------------------------- 
get_pval <- function(z=NULL) {
  if(is.null(z)){
    z <- data.frame(NO=sample(15:1000,20),YES=sample(15:1000,20))}
  else{
    t <- pivot_wider(z,id_cols=CPD_ID,names_from=DO_CHEMISTRY,values_from=MW)}
  pval <- t.test(t[,"YES"],t[,"NO"])$p.value
return(pval)
}
get_pval(df)

## [1] 5.590193e-18

Function to randomize the data

##################################################
# function: shuffle_data
# randomize data for regression analysis
# input: 2-column data frame (x_var,y_var)
# output: 2-column data frame (x_var,y_var)
#------------------------------------------------- 
shuffle_data <- function(z=NULL) {
  if(is.null(z)){
    # random group assignment
    DO_CHEMISTRY <- sample(c('YES','NO'),20,T)
    # minimum MW is 15 (methane), max chosen arbitrarily
    MW <- sample(15:1000,20,F)
    z <- data.frame(INDEX=1:20,DO_CHEMISTRY,MW)}
  z[,2] <- sample(z[,2])
  return(z)
}
head(shuffle_data(df))

##   CPD_ID DO_CHEMISTRY       MW
## 1      1           NO 287.4197
## 2      2          YES 333.4054
## 3      3           NO 296.7758
## 4      4           NO 311.3749
## 5      5           NO 297.3484
## 6      6           NO 397.2404

Putting it all together in program body

# p-vlaue from the randomized data
p_rand <- get_pval(shuffle_data(read_data("Data/NIHMS50746-supplement-AZ_chemsurvey_QED.csv")))
p_reg <- get_pval(read_data("Data/NIHMS50746-supplement-AZ_chemsurvey_QED.csv"))

p_rand_vect <- c()
for(i in 1:100){
  p_rand_vect <- append(p_rand_vect,get_pval(shuffle_data(read_data("Data/NIHMS50746-supplement-AZ_chemsurvey_QED.csv"))))
}

p-value for single randomization test vs that of the regular data

# p-value from unrandomized data is vastly smaller than that of the randomized data
data.frame(p_rand,p_reg)

##     p_rand        p_reg
## 1 0.765171 5.590193e-18

Average p-value for 100 randomization tests vs that of the regular data

data.frame(mean(p_rand_vect),p_reg)

##   mean.p_rand_vect.        p_reg
## 1         0.5093189 5.590193e-18

Even after 100 randomization tests, there is still a significant difference between the average p-value of the randomization tests compared to the p-value of the un-randomized data. This is strong evidence that there is an actual difference in the molecular weight of chemicals deemed “attractive” versus those labelled “unattractive” by medicinal chemists. This makes sense, because medicinal chemists likely know that good drugs usually aren’t big, heavy molecules.

Homework_11

Noah Beckage

4/6/2022