Changing specific element in various variables - r

I've got variables in a dataset like this:
dat1 <- read.table(header=TRUE, text="
comp_T1_01 comp_T1_02 comp_T1_03 res_T1_01 res_T1_02 res_T1_03 res_T1_04
1 1 2 1 5 5 5
2 1 3 3 4 4 1
3 1 3 1 3 2 2
4 2 5 5 3 2 2
5 1 4 1 2 1 3
")
I would like to erase the "T1" from all the variable names at once. As I have over 100 variables, renaming them one by one with "colnames" would be a bit too complicated.
Is there a command that can do that?
Thank you!

You can use sub :
names(dat1) <- sub('_T1', '', names(dat1))
dat1
# comp_01 comp_02 comp_03 res_01 res_02 res_03 res_04
#1 1 1 2 1 5 5 5
#2 2 1 3 3 4 4 1
#3 3 1 3 1 3 2 2
#4 4 2 5 5 3 2 2
#5 5 1 4 1 2 1 3
In dplyr, you can use rename_with :
library(dplyr)
dat1 %>% rename_with(~sub('_T1', '', .))

We can use str_remove
library(dplyr)
library(stringr)
dat1 %>%
rename_all(~ str_remove(., '_T1'))

This may also work:
names(dat1) <- gsub(x = names(dat1), pattern = "\\_T1", replacement = "")
dat1
comp_01 comp_02 comp_03 res_01 res_02 res_03 res_04
1 1 1 2 1 5 5 5
2 2 1 3 3 4 4 1
3 3 1 3 1 3 2 2
4 4 2 5 5 3 2 2
5 5 1 4 1 2 1 3

Related

identify whenever values repeat in r

I have a dataframe like this.
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3))
I want to populate a new variable Sequence which identifies whenever Condition starts again from 1.
So the new dataframe would look like this.
Thanks in advance for the help!
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3),
Sequence = c(1,1,1,1,2,2,2,2,2,2,3,3,3,3,3))
base R
data$Sequence2 <- cumsum(c(TRUE, data$Condition[-1] == 1 & data$Condition[-nrow(data)] != 1))
data
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
dplyr
library(dplyr)
data %>%
mutate(
Sequence2 = cumsum(Condition == 1 & lag(Condition != 1, default = TRUE))
)
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
This took a while, but I finally found this solution:
library(dplyr)
data %>%
group_by(Sequnce = cumsum(
ifelse(Condition==1, lead(Condition)+1, Condition)
- Condition==1)
)
Condition Sequnce
<dbl> <int>
1 1 1
2 1 1
3 2 1
4 3 1
5 1 2
6 1 2
7 2 2
8 2 2
9 2 2
10 3 2
11 1 3
12 1 3
13 2 3
14 3 3
15 3 3

Rename all column names that include a certain string

Let's say I have a dataset with 6 columns, and in the names of the columns that include the string "likes_comment" I want to replace that string with the string "number_likes".
Example data:
da = data.frame(likes_comment_1 = c(1,2,3,4), likes_comment_2 = c(2,2,3,1), likes_comment_3=c(5,2,3,1), quotes_comment1=c(2,1,3,4), quotes_comment2_=c(3,5,7,1), quotes_comment3=c(2,3,1,2))
da
likes_comment_1 likes_comment_2 likes_comment_3 quotes_comment1 quotes_comment2_ quotes_comment3
1 1 2 5 2 3 2
2 2 2 2 1 5 3
3 3 3 3 3 7 1
4 4 1 1 4 1 2
Target data:
target_da = data.frame(number_likes_1 = c(1,2,3,4), number_likes_2 = c(2,2,3,1), number_likes_3=c(5,2,3,1), quotes_comment1=c(2,1,3,4), quotes_comment2_=c(3,5,7,1), quotes_comment3=c(2,3,1,2))
target_da
number_likes_1 number_likes_2 number_likes_3 quotes_comment1 quotes_comment2_ quotes_comment3
1 1 2 5 2 3 2
2 2 2 2 1 5 3
3 3 3 3 3 7 1
4 4 1 1 4 1 2
How can I do this?
You can use rename_with -
library(dplyr)
library(stringr)
da %>% rename_with(~str_replace(., 'likes_comment', 'number_likes'))
# number_likes_1 number_likes_2 number_likes_3 quotes_comment1
#1 1 2 5 2
#2 2 2 2 1
#3 3 3 3 3
#4 4 1 1 4
# quotes_comment2_ quotes_comment3
#1 3 2
#2 5 3
#3 7 1
#4 1 2
Use sub:
names(da) <- sub("likes_comment_(\\d+)", "number_likes_\\1", names(da))
Does this work:
names(da)[grepl('likes_comment',names(da))] <- gsub('likes_comment','number_likes',names(da)[grepl('likes_comment',names(da))])
da
number_likes_1 number_likes_2 number_likes_3 quotes_comment1 quotes_comment2_ quotes_comment3
1 1 2 5 2 3 2
2 2 2 2 1 5 3
3 3 3 3 3 7 1
4 4 1 1 4 1 2
Since OP is tagged with data.table:
library(data.table)
setnames(da, sub('likes_comment', 'number_likes', names(da), fixed = TRUE))

Using "contain" function with two arguments in R

I have a dataset that looks, for example, like this:
dat1 <- read.table(header=TRUE, text="
Trust_01_T1 Trust_02_T1 Trust_03_T1 Trust_01_T2 Trust_02_T2 Trust_03_T2 Cont_01_T1 Cont_01_T2
5 1 2 1 5 3 1 1
3 1 3 3 4 2 1 2
2 1 3 1 3 1 2 2
4 2 5 5 3 2 3 3
5 1 4 1 2 2 4 5
")
I'd like to use the select function to gather the variables that contain Trust AND T1.
dat1 <- dat1 %>%
mutate(Trust_T1 = select(., contains("Trust")))
Does anybody know how to use two arguments there, to get the variables that contain both Trust AND T1? If I use:
dat1 <- dat1 %>%
mutate(Trust_T1 = select(., contains("Trust"), contains("T1")))
it gives me the Variables that contain EITHER Trust or T1.
best!
If we need both, then use matches with a regex to specify the column names that start (^) with 'Trust' and end ($) with 'T1' (assuming these are the only patterns):
library(dplyr)
dat1 %>%
select(matches("^Trust_.*T1$"))
It is not clear why mutate is used to create a new column, as there are multiple columns that match 'Trust' followed by 'T1'. If the intention is to do some operations on the selected columns, either across or c_across with rowwise can be used (not clear from the post).
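One solution could be the following (note that | selects the union, i.e. columns matching either condition, as the output below shows; use & instead of | to keep only the columns that match both):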
library(dplyr)
df %>% select(starts_with('Trust') | contains('_T1'))
#> Trust_01_T1 Trust_02_T1 Trust_03_T1 Trust_01_T2 Trust_02_T2 Trust_03_T2
#> 1 5 1 2 1 5 3
#> 2 3 1 3 3 4 2
#> 3 2 1 3 1 3 1
#> 4 4 2 5 5 3 2
#> 5 5 1 4 1 2 2
#> Cont_01_T1
#> 1 1
#> 2 1
#> 3 2
#> 4 3
#> 5 4
DATA
df <- read.table(text =
"
Trust_01_T1 Trust_02_T1 Trust_03_T1 Trust_01_T2 Trust_02_T2 Trust_03_T2 Cont_01_T1 Cont_01_T2
5 1 2 1 5 3 1 1
3 1 3 3 4 2 1 2
2 1 3 1 3 1 2 2
4 2 5 5 3 2 3 3
5 1 4 1 2 2 4 5
", header =T)

Can I use Boolean operators with R tidy select functions

Is there a way I can use Boolean operators (e.g. | or &) with the tidyselect helper functions to select variables?
The code below illustrates what currently works and what, in my mind, should work but doesn't.
df<-sample(seq(1,4,1), replace=T, size=400)
df<-data.frame(matrix(df, ncol=10))
#make variable names
library(tidyverse)
library(stringr)
vars1<-str_c('q1_', seq(1,5,1))
vars2<-str_c('q9_', seq(1,5,1))
#Assign
names(df)<-c(vars1, vars2)
names(df)
#This works
df %>%
select(starts_with('q1_'), starts_with('q9'))
#This does not work using |
df %>%
select(starts_with('q1_'| 'q9_'))
#This does not work with c()
df %>%
select(starts_with(c('q1_', 'q9_')))
You can use multiple starts_with, e.g.,
df %>% select(starts_with('q1_'), starts_with('q9_'))
You can use | in a regular expression and matches() (in this case, in combination with ^, the regex beginning-of-string)
df %>% select(matches('^q1_|^q9_'))
You can also approach it using purrr:
map(.x = c("q1_", "q9_"), ~ df %>%
select(starts_with(.x))) %>%
bind_cols()
q1_1 q1_2 q1_3 q1_4 q1_5 q9_1 q9_2 q9_3 q9_4 q9_5
1 2 4 3 1 2 2 3 1 1 3
2 1 3 3 4 4 3 2 2 1 3
3 2 2 3 4 3 4 1 3 2 4
4 1 2 4 2 4 3 3 1 3 3
5 3 1 2 3 3 2 2 3 3 3
6 4 2 3 4 1 4 2 4 2 4
7 3 1 4 1 4 2 4 4 1 2
8 2 2 3 2 1 3 3 3 1 4
9 1 4 2 3 4 4 1 1 3 4
10 1 1 2 4 1 1 4 4 1 2

Combining an individual and aggregate level data sets

I've got two different data frames; let's call them "Months" and "People".
Months looks like this:
Month Site X
1 1 4
2 1 3
3 1 5
1 2 10
2 2 7
3 2 5
and People looks like this:
ID Month Site
1 1 1
2 1 2
3 1 1
4 2 2
5 2 2
6 2 2
7 3 1
8 3 2
I'd like to combine them so essentially each time an entry in "People" has a particular Month and Site combination, it's added to the appropriate aggregated data frame, so I'd get something like the following:
Month Site X People
1 1 4 2
2 1 3 0
3 1 5 1
1 2 10 1
2 2 7 3
3 2 5 1
But I haven't the foggiest idea of how to go about doing that. Any suggestions?
Using base packages (the result of the aggregate() call is assumed to be stored as aggdata before the merge):
> aggregate( ID ~ Month + Site, data=People, FUN = length )
Month Site ID
1 1 1 2
2 3 1 1
3 1 2 1
4 2 2 3
5 3 2 1
> res <- merge(Months, aggdata, all.x = TRUE)
> res
Month Site X ID
1 1 1 4 2
2 1 2 10 1
3 2 1 3 NA
4 2 2 7 3
5 3 1 5 1
6 3 2 5 1
> res[is.na(res)] <- 0
> res
Month Site X ID
1 1 1 4 2
2 1 2 10 1
3 2 1 3 0
4 2 2 7 3
5 3 1 5 1
6 3 2 5 1
Assuming your data.frames are months and people, here's a data.table solution:
require(data.table)
m.dt <- data.table(months, key=c("Month", "Site"))
p.dt <- data.table(people, key=c("Month", "Site"))
# one-liner
dt.f <- p.dt[m.dt, list(X=X[1], People=sum(!is.na(ID)))]
> dt.f
# Month Site X People
# 1: 1 1 4 2
# 2: 1 2 10 1
# 3: 2 1 3 0
# 4: 2 2 7 3
# 5: 3 1 5 1
# 6: 3 2 5 1

Resources