-
Notifications
You must be signed in to change notification settings - Fork 0
Data Cleaning
kirkvanacore edited this page May 24, 2021
·
1 revision
Basic subset of rows based on criteria
# Keep the rows of `old` that satisfy at least one of two criteria.
# subset() evaluates the condition inside the data frame, so columns are
# referenced by bare name; rows where the condition is NA are dropped.
# `var1` and `var2` are placeholders -- substitute your own columns and cutoffs.
new <- subset(old, var1 > 4 | var2 > 4)
Subset only a few columns
# Keep only the 4th through 12th columns (assumes `attr` is a data frame;
# a single index inside [ ] selects columns).
attr <- attr[seq(4, 12)]
Subset rows and keep only certain columns
# Keep the rows of `old` that satisfy at least one of two criteria, and
# return only the columns named in `select`.
# `var1` and `var2` are placeholders -- substitute your own columns and cutoffs.
new <- subset(old, var1 > 4 | var2 > 4, select = c(var1, var2))
Subset rows whose value appears in a list
# Keep the rows of `perf` whose task_id is one of the listed values.
# "A" ... "E" are placeholders for the task IDs to keep; character IDs must
# be quoted (leave numeric IDs unquoted). %in% never returns NA, so rows with
# a missing task_id are simply excluded.
c_perf <- subset(perf, task_id %in% c("A", "B", "C", "D", "E"))
### Merge or Join Data
Merge two data frames by ID
# Merge two data frames on their shared "ID" column.
# `data_frame_a` and `data_frame_b` are placeholders for your own data frames.
# With the default all = FALSE this is an inner join: only IDs present in
# both inputs are kept (use all.x / all.y / all = TRUE for outer joins).
total <- merge(data_frame_a, data_frame_b, by = "ID")
Join two data frames using the dplyr package (analogous to SQL joins)
# Left join: keep every row of pc_perf and attach the columns of attr from
# the rows with a matching task_id.
analysis <- pc_perf %>%
  left_join(attr, by = "task_id")
Rename Variables
# Many options for this. Two are below.

# Option 1 (base R): rename the second column of the cars df to
# "Stopping Distance (ft)". Assigning to the built-in `cars` data set
# creates a modified copy in the workspace.
colnames(cars)[2] <- "Stopping Distance (ft)"

# Option 2 (dplyr): rename the column that is currently "dist" to
# "Stopping Distance (ft)". rename() takes new_name = old_name.
# The pipeline starts from datasets::cars (the untouched built-in data) so it
# still works after option 1 has already renamed `dist` in the workspace copy.
# Nothing is assigned here; colnames() only prints the resulting names.
datasets::cars %>%
  rename("Stopping Distance (ft)" = dist) %>%
  colnames()
Missing data: count NAs in all columns of a df
# Count the missing values in every column of df. The result is a one-row
# data frame with one count per column.
# across() replaces summarise_all(funs(...)): funs() is deprecated in dplyr,
# and the former select(everything()) step was a no-op.
# Base R equivalent: colSums(is.na(df))
df %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
# Parse the StartDate values as month-day-year hour:minute:second date-times
# and store them in the Date column.
# NOTE(review): mdy_hms() is presumably lubridate's parser -- confirm that
# library(lubridate) is loaded before running this.
vou19_20$Date <- mdy_hms(vou19_20$StartDate)
### Create Percentile Rank Variable
# ecdf() builds the empirical cumulative distribution function of the scores;
# evaluating it at each score gives the share of scores less than or equal to
# that value. The result is a proportion in (0, 1] -- multiply by 100 for a
# 0-100 percentile. Rows with a missing score get NA.
df$percentile <- stats::ecdf(df$score)(df$score)