R rename columns (hlookup) - r

I have the following data
data <- data.frame(c=1:5, ID=0,a=2,b=5:9)
naming <- data.frame(short=c("a","b","c", "d", "e"), long=c("aa","bb","cc", "dd", "ee"))
I would like to rename the columns in the data frame data from c, ID, a, b to cc, ID, aa, bb, using naming as a lookup table.
I tried:
colnames(data) <- naming[match(naming$short, colnames(data)),2]
but this does not work because the two vectors are not the same length. Furthermore, I would like to keep the column names in data that are not in naming.
Any suggestions? Basically, it's the HLOOKUP function from Excel, but because the data files are large I cannot do this in Excel.

Base R:
matches <- match(names(data), naming$short)
matches
# [1] 3 NA 1 2
ifelse(is.na(matches), names(data), naming$long[matches])
# [1] "cc" "ID" "aa" "bb"
names(data) <- ifelse(is.na(matches), names(data), naming$long[matches])
data
# cc ID aa bb
# 1 1 0 2 5
# 2 2 0 2 6
# 3 3 0 2 7
# 4 4 0 2 8
# 5 5 0 2 9

Using dplyr's rename_at and passing an anonymous function seems to fit your purposes:
require(dplyr)
data <- data.frame(c=1:5, ID=0,a=2,b=5:9)
data
# c ID a b
# 1 1 0 2 5
# 2 2 0 2 6
# 3 3 0 2 7
# 4 4 0 2 8
# 5 5 0 2 9
cols_iwant_to_rename <- c("a","b","c")
data <-
data %>% rename_at(cols_iwant_to_rename , function(x) paste0(x, x))
data
# cc ID aa bb
# 1 1 0 2 5
# 2 2 0 2 6
# 3 3 0 2 7
# 4 4 0 2 8
# 5 5 0 2 9

Related

R joining on counts of elements in a vector to matching index in a dataframe

I have a dataframe df that looks like this:
indx
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
df<-structure(list(indx = 1:10), row.names = c(NA, 10L), class = "data.frame")
And a vector vec that looks like this:
vec<-c(3,5,9,9,5,4,3,3)
I would like to find the counts of each element in vec e.g. by doing this:
vec_counts<-table(vec)
3 4 5 9
3 1 2 2
And then join these counts into the matching indx in df. The final result should be:
indx vec_counts
1 1 0
2 2 0
3 3 3
4 4 1
5 5 2
6 6 0
7 7 0
8 8 0
9 9 2
10 10 0
You can use merge.
x <- merge(df, as.data.frame(table(vec)), by.x = "indx", by.y = "vec", all.x = TRUE)
names(x) <- c("indx", "vec_counts")
x$vec_counts[is.na(x$vec_counts)] <- 0
x
# indx vec_counts
# 1 1 0
# 2 2 0
# 3 3 3
# 4 4 1
# 5 5 2
# 6 6 0
# 7 7 0
# 8 8 0
# 9 9 2
# 10 10 0
Also you can use outer() to calculate it in one quick step without table().
df$vec_counts <- rowSums(outer(df$indx, vec, `==`))
# or with dplyr
library(dplyr)
df %>%
mutate(vec_counts = rowSums(outer(indx, vec, `==`)))
A possible solution, based on tidyverse:
library(tidyverse)
df %>%
mutate(count = map_dbl(indx, ~ sum(.x == vec)))
#> indx count
#> 1 1 0
#> 2 2 0
#> 3 3 3
#> 4 4 1
#> 5 5 2
#> 6 6 0
#> 7 7 0
#> 8 8 0
#> 9 9 2
#> 10 10 0
Or in base R:
df$count <- apply(df, 1, \(x) sum(x[1] == vec))
Or even, still in base R, using sapply:
df$count <- sapply(df$indx, \(x) sum(x == vec))

how to remove part of a phrase from all variables in a column in R

say I have a Data frame
g <- c("Smember_1", "Smember_1", "Smember_1", "Smember_2", "Smember_2", "Smember_2", "Smember_3", "Smember_3", "Smember_3")
m <- c(1,2,1,3,4,1,3,5,6)
df <- data.frame(g, m)
g m
1 Smember_1 1
2 Smember_1 2
3 Smember_1 1
4 Smember_2 3
5 Smember_2 4
6 Smember_2 1
7 Smember_3 3
8 Smember_3 5
9 Smember_3 6
I would like to remove the prefix Smember_ from all the values in the g column, so that the data frame df looks like this:
> df
g m
1 1 1
2 1 2
3 1 1
4 2 3
5 2 4
6 2 1
7 3 3
8 3 5
9 3 6
I think you want
# Keep the whole trailing number of each value in g.
# The prefix is matched lazily (".*?") so that "(\\d+)$" captures every
# trailing digit; a greedy ".*" would leave only the last digit
# (e.g. "Smember_10" would become "0" instead of "10").
# perl = TRUE is used so the lazy quantifier is handled by PCRE.
# Values that do not end in a digit are left unchanged.
df$g <- sub("^.*?(\\d+)$", "\\1", df$g, perl = TRUE)
df2$variable <- gsub("Smember_","", df2$variable)
worked!

R: populating a column of a data frame based on results of simulation

Question continued:
Remove duplicate outcomes, when outcomes are strings and not in the same order
I want to create a data frame with the possible outcomes of rolling two dice. The point of this is to run a simulation separately and populate the data frame with the number of outcomes. I wrote the following code to create the data frame:
# Create variables in data frame
dice1 <- sort(rep(1:6,6))
dice2 <- rep(1:6,6)
dicesum <- dice1 + dice2
# Assign variables to data frame
df <- data.frame(dice1, dice2, dicesum)
# Remove duplicates
inx <- duplicated(t(apply(df, 1, sort)))
df <- df[!inx, ]
rownames(df) <- 1:nrow(df)
# initiate a column that holds the simulation outcome count
df["count"] <- numeric(nrow(df))
> str(df)
'data.frame': 21 obs. of 4 variables:
$ dice1 : int 1 1 1 1 1 1 2 2 2 2 ...
$ dice2 : int 1 2 3 4 5 6 2 3 4 5 ...
$ dicesum: int 2 3 4 5 6 7 4 5 6 7 ...
$ count : num 0 0 0 0 0 0 0 0 0 0 ...
> head(df)
dice1 dice2 dicesum count
1 1 1 2 0
2 1 2 3 0
3 1 3 4 0
4 1 4 5 0
5 1 5 6 0
6 1 6 7 0
# Simulate dice rolls
sim_dice1 <- sample(1:6, 100, replace = T)
sim_dice2 <- sample(1:6, 100, replace = T)
# Data frame with simulations
rolls <- data.frame(sim_dice1, sim_dice2)
> str(rolls)
'data.frame': 100 obs. of 2 variables:
$ sim_dice1: int 2 1 5 2 4 2 1 4 6 1 ...
$ sim_dice2: int 6 5 4 1 4 5 4 5 6 2 ...
> head(rolls)
sim_dice1 sim_dice2
1 2 6
2 1 5
3 5 4
4 2 1
5 4 4
6 2 5
What is the best way to populate the "count" column in df with the outcomes of the simulation? Note that the simulation data frame has duplicate outcomes - I consider a (1,6) and a (6,1) to be the same outcome.
We can use the dplyr package to achieve this task.
library(dplyr)
# Create and count the number of each Group
rolls2 <- rolls %>%
rowwise() %>%
mutate(Group = toString(sort(c(sim_dice1, sim_dice2)))) %>%
ungroup() %>%
count(Group)
# Create the Group name
df2 <- df %>%
rowwise() %>%
mutate(Group = toString(sort(c(dice1, dice2))))
# Perform merge between df2 and rolls2
df3 <- df2 %>%
left_join(rolls2, by = "Group") %>%
select(-Group) %>%
rename(count = n) %>%
replace(is.na(.), 0)
df3
Source: local data frame [21 x 4]
Groups: <by row>
# A tibble: 21 x 4
dice1 dice2 dicesum count
<int> <int> <int> <dbl>
1 1 1 2 0
2 1 2 3 5
3 1 3 4 5
4 1 4 5 8
5 1 5 6 4
6 1 6 7 5
7 2 2 4 2
8 2 3 5 8
9 2 4 6 7
10 2 5 7 7
# ... with 11 more rows
DATA
# Create variables in data frame
dice1 <- sort(rep(1:6,6))
dice2 <- rep(1:6,6)
dicesum <- dice1 + dice2
# Assign variables to data frame
df <- data.frame(dice1, dice2, dicesum)
# Remove duplicates
inx <- duplicated(t(apply(df, 1, sort)))
df <- df[!inx, ]
rownames(df) <- 1:nrow(df)
# Set seed for the reproducibility
set.seed(123)
# Simulate dice rolls
sim_dice1 <- sample(1:6, 100, replace = T)
sim_dice2 <- sample(1:6, 100, replace = T)
# Data frame with simulations
rolls <- data.frame(sim_dice1, sim_dice2)
Is this what you are looking for:
> # reduce to 10 simulation for illustration
> set.seed(17699398)
> sim_dice1 <- sample(1:6, 10, replace = T)
> sim_dice2 <- sample(1:6, 10, replace = T)
>
> sim_sum <- sim_dice1 + sim_dice2
>
> # print for illustration
> cbind(sim_dice1, sim_dice2, sim_sum)
sim_dice1 sim_dice2 sim_sum
[1,] 6 5 11
[2,] 3 1 4
[3,] 3 2 5
[4,] 6 5 11
[5,] 3 6 9
[6,] 3 2 5
[7,] 1 5 6
[8,] 1 2 3
[9,] 2 4 6
[10,] 2 2 4
>
> # make table
> sim_outcome <- table(sim_sum)
> sim_outcome
sim_sum
3 4 5 6 9 11
1 2 2 2 1 2
>
>
> # use that df and returned object from table function is sorted
> df$count[match(as.integer(names(sim_outcome)), df$dicesum)] <- sim_outcome
>
> df
dice1 dice2 dicesum count
1 1 1 2 0
2 1 2 3 1
3 1 3 4 2
4 1 4 5 2
5 1 5 6 2
6 1 6 7 0
7 2 2 4 0
8 2 3 5 0
9 2 4 6 0
10 2 5 7 0
11 2 6 8 0
12 3 3 6 0
13 3 4 7 0
14 3 5 8 0
15 3 6 9 1
16 4 4 8 0
17 4 5 9 0
18 4 6 10 0
19 5 5 10 0
20 5 6 11 2
21 6 6 12 0

Iterate over a subset of column names

I am new to R. I have a data frame with multiple measurements for a couple of conditions. For the columns that belong to the same condition, I would like to test, row by row, whether at least two of the measurements are true measurements (not zero) and, if so, store the mean of that condition's measurements in a new data set.
> sample <- list(c(8,0,12,5,0,11), c(15,5,0,10,12,13), c(1,1,0,3,0,9),
c(11,9,8,0,4,7), c(12,5,5,0,9,0), c(1,7,2,0,8,0))
> sample <- as.data.frame(sample)
> colnames(sample) <- c("x.1","x.2","x.3","y.1","y.2","y.3")
> sample
x.1 x.2 x.3 y.1 y.2 y.3
1 8 15 1 11 12 1
2 0 5 1 9 5 7
3 12 0 0 8 5 2
4 5 10 3 0 0 0
5 0 12 0 4 9 8
6 11 13 9 7 0 0
My output dataset should ideally look like this:
> Newsample
x y
1 8 8
2 2 7
3 0 5
4 6 0
5 0 7
6 11 0
We define f_rowmean function:
# Row-wise mean of a block of measurement columns.
#
# y: a numeric matrix or data frame with one column per measurement.
#
# Returns a numeric vector with one element per row of y: the mean of the
# whole row (zeros included) when the row has at least two non-zero
# entries, and 0 otherwise.
#
# Vectorised with rowSums()/rowMeans() instead of calling apply() with a
# scalar ifelse() on every row.
f_rowmean <- function(y) {
  enough_nonzero <- rowSums(y != 0) >= 2
  ifelse(enough_nonzero, rowMeans(y), 0)
}
And then:
data.frame(x=f_rowmean(sample[,grep("x", names(sample))]),
y=f_rowmean(sample[,grep("y", names(sample))]))
# x y
# 1 8 8
# 2 2 7
# 3 0 5
# 4 6 0
# 5 0 7
# 6 11 0
EDIT
As for OP's new problem statement (in comments), suppose your data set is in df1, then you could do:
res.cols <- c("CAOV-3 Reg", "CAOV-3 Mod", "OVCAR-3Reg", "OVCAR-4Reg", "VOA1056Reg",
"VOA4698Reg", "VOA4698Mod", "TOV112DReg", "TOV112DMod", "TOV21G Mod",
"HCC38 Reg", "HCC38 Mod")
res <- setNames(data.frame(matrix(0,nrow(df1),length(res.cols))), res.cols)
res <- sapply(res.cols, function(x) res[,x] <- f_rowmean(df1[,grep(x, names(df1))]))
We loop through the index of 'x' and 'y' columns in a list, get the rowSums of logical matrix and use ifelse to get the rowMeans
data.frame(setNames(lapply(list(grep("^x", names(sample)),
grep("^y", names(sample))), function(i) {
x1 <- sample[i]
ifelse(rowSums(x1!=0)>1, rowMeans(x1), 0)}), c("x", "y")))
# x y
#1 8 8
#2 2 7
#3 0 5
#4 6 0
#5 0 7
#6 11 0

R language check missing data for columns and rows

I have a data frame sells and I want to check the missing data in both rows and columns
What I did for rows is:
sells[, complete.cases(sells)]
nrows(sells[, complete.cases(sells)])
but I don't know how to solve it for columns.
Help please
First, let's take the iris data frame and randomly insert some NAs:
iris.demo <- iris
iris.nas <- matrix(as.logical(sample(FALSE:TRUE, size = 150*5,
prob = c(.9,.1),replace = TRUE)),ncol = 5)
iris.demo[iris.nas] <- NA
For rows, it is pretty straightforward:
sum(complete.cases(iris.demo))
# [1] 75
For columns, two possibilities (among several possible others):
Transposing the whole dataframe
sum(complete.cases(t(iris.demo)))
# [1] 0 # 0 columns are complete
Using lapply to count the "non-missing" on every column and see if it's equal to nrow:
sum(lapply(iris.demo, function(x) sum(!is.na(x))) == nrow(iris.demo))
# [1] 0
You could do it like this:
set.seed(1)
(sells <- data.frame(replicate(2, sample(c(1:3, NA), 10, T)), x3 = 1:10))
# X1 X2 x3
# 1 NA 2 1
# 2 1 3 2
# 3 3 2 3
# 4 1 1 4
# 5 2 NA 5
# 6 2 3 6
# 7 1 NA 7
# 8 2 1 8
# 9 NA 3 9
# 10 2 2 10
Rows:
# Keep only the rows of sells that contain no NA in any column
# (output shown for the sells data frame printed above).
sells[complete.cases(sells), ]
# X1 X2 x3
# 2 1 3 2
# 3 3 2 3
# 4 1 1 4
# 6 2 3 6
# 8 2 1 8
# 10 2 2 10
# Number of complete rows
nrow(sells[complete.cases(sells), ])
# [1] 6
Columns:
# Select the columns of sells that contain at least one NA
# (output shown for the sells data frame printed above).
sells[, sapply(sells, function(col) any(is.na(col)))]
# X1 X2
# 1 NA 2
# 2 1 3
# 3 3 2
# 4 1 1
# 5 2 NA
# 6 2 3
# 7 1 NA
# 8 2 1
# 9 NA 3
# 10 2 2
# Number of columns that contain at least one NA
sum(sapply(sells, function(col) any(is.na(col))))
# [1] 2

Resources