Changing the row id conditional on a different column? - r

Hi I have a data set like this
df <- cbind(c("id",1,2,3,4,5,6,7,8,9,10,11), c("trial",1,1,1,1,1,1,2,2,2,2,2))
However, I want to change the ids conditional on the trial number - something like this
df1 <- cbind(c("id",1.1,1.2,1.3,1.4,1.5,1.6,2.1,2.2,2.3,2.4,2.5), c("trial",1,1,1,1,1,1,2,2,2,2,2))
I would really appreciate it if someone could help me with this. I am still learning R.

df <- data.frame(id = c(1,2,3,4,5,6,7,8,9,10,11), trial = c(1,1,1,1,1,1,2,2,2,2,2))
library(dplyr)
df |>
arrange(trial, id) |>
group_by(trial) |>
mutate(trial_id = row_number()) |>
ungroup() |>
mutate(id = as.numeric(paste0(trial, ".", trial_id)))
# OR use: tidyr::unite("id", c(trial, trial_id), sep = ".", remove = FALSE)
# A tibble: 11 × 3
id trial trial_id
<dbl> <dbl> <int>
1 1.1 1 1
2 1.2 1 2
3 1.3 1 3
4 1.4 1 4
5 1.5 1 5
6 1.6 1 6
7 2.1 2 1
8 2.2 2 2
9 2.3 2 3
10 2.4 2 4
11 2.5 2 5

Related

Replace 'middle' frequencies with averaged frequency values

I have this type of data, with frequency data and position data grouped by rowid:
df
rowid word f position
1 2 i 700 1
2 2 'm 600 2
3 2 fine 1 3
4 3 how 400 1
5 3 's 500 2
6 3 the 700 3
7 3 weather 20 4
8 4 it 390 1
9 4 's 500 2
10 4 really 177 3
11 4 very 200 4
12 4 cold 35 5
13 5 i 700 1
14 5 love 199 2
15 5 you 400 3
The task I'm facing seems simple: in those rowids where there are more than 3 positions, I need to replace the frequencies of all middle positions with their average. The following approach works but seems over-convoluted, so I'm almost certain there will be a more straightforward dplyrway to get the desired output:
df %>%
group_by(rowid) %>%
# filter for 'middle' positions:
filter(position != first(position) & position != last(position)) %>%
# summarise:
summarize(across(position),
# create average frequency:
f_middle_position = mean(f, na.rm = TRUE),
# concatenate words:
word = str_c(word, collapse = " ")
) %>%
filter(!duplicated(f_middle_position)) %>%
# join with df:
left_join(df, ., by = c("rowid", "position")) %>%
# remove rows other than #1,#2, and last:
group_by(rowid) %>%
# create row count:
mutate(rn = row_number()) %>%
# filter first, second, and last row per group:
filter(rn %in% c(1, 2, last(rn))) %>%
# transfer frequencies for middle positions:
mutate(f = ifelse(is.na(f_middle_position), f, f_middle_position)) %>%
# make more changes:
mutate(
# change position labels:
position = ifelse(position == first(position), 1,
ifelse(position == last(position), 2, 1.5)),
# update word:
word = ifelse(is.na(word.y), word.x, word.y)
) %>%
# remove obsolete variables:
select(-c(f_middle_position, word.y, word.x,rn))
A tibble: 12 × 4
# Groups: rowid [4]
rowid f position word
<dbl> <dbl> <dbl> <chr>
1 2 700 1 i
2 2 600 1.5 'm
3 2 1 2 fine
4 3 400 1 how
5 3 600 1.5 's the
6 3 20 2 weather
7 4 390 1 it
8 4 292. 1.5 's really very
9 4 35 2 cold
10 5 700 1 i
11 5 199 1.5 love
12 5 400 2 you
How can this result be obtained in a more concise way in dplyr and, preferably without the left_join, which causes problems with my actual data?
Data:
df <- data.frame(
rowid = c(2,2,2,3,3,3,3,4,4,4,4,4,5,5,5),
word = c("i","'m","fine",
"how","'s","the","weather",
"it","'s","really", "very","cold",
"i","love","you"),
f = c(700,600,1,
400,500,700,20,
390,500,177,200,35,
700,199,400),
position = c(1,2,3,
1,2,3,4,
1,2,3,4,5,
1,2,3)
)
You can create a group variable pos that marks the first row with 1, the middle with 1.5, and the last with 2. Then group the data by rowid and pos and apply mean() and paste() on f and word respectively.
library(dplyr)
df %>%
group_by(rowid) %>%
mutate(pos = case_when(position == 1 ~ 1, position == n() ~ 2, TRUE ~ 1.5)) %>%
group_by(rowid, pos) %>%
summarise(f = mean(f), word = paste(word, collapse = ' '), .groups = 'drop')
# # A tibble: 12 × 4
# rowid pos f word
# <dbl> <dbl> <dbl> <chr>
# 1 2 1 700 i
# 2 2 1.5 600 'm
# 3 2 2 1 fine
# 4 3 1 400 how
# 5 3 1.5 600 's the
# 6 3 2 20 weather
# 7 4 1 390 it
# 8 4 1.5 292. 's really very
# 9 4 2 35 cold
# 10 5 1 700 i
# 11 5 1.5 199 love
# 12 5 2 400 you

arranging columns based on numeric values in r

I need to arrange column names based on numbering.
Here is a short version of my dataset.
df <- data.frame(id = c(1,2,3),
raw_score = c(10,20,30),
a = c(1,1,1),
b = c(2,3,4),
c = c(4,6,7))
names(df) <- c("id","raw_score","2.2","2.3","2.1")
> df
id raw_score 2.2 2.3 2.1
1 1 10 1 2 4
2 2 20 1 3 6
3 3 30 1 4 7
How can I arrange the columns below?
> df
id raw_score 2.1 2.2 2.3
1 1 10 4 1 2
2 2 20 6 1 3
3 3 30 7 1 4
Maybe
df %>% dplyr::select(id, raw_score,stringr::str_sort(colnames(df[, 3:ncol(df)]), numeric = TRUE)) -> df

Determine percentage of rows with missing values in a dataframe in R

I have a data frame with three variables and some missing values in one of the variables that looks like this:
subject <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2)
part <- c(0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3)
sad <- c(1,7,7,4,NA,NA,2,2,NA,2,3,NA,NA,2,2,1,NA,5,NA,6,6,NA,NA,3,3,NA,NA,5,3,NA,7,2)
df1 <- data.frame(subject,part,sad)
I have created a new data frame with the mean values of 'sad' per subject and part using a loop, like this:
columns<-c("sad.m",
"part",
"subject")
df2<-matrix(data=NA,nrow=1,ncol=length(columns))
df2<-data.frame(df2)
names(df2)<-columns
tn<-unique(df1$subject)
row=1
for (s in tn){
for (i in 0:3){
TN<-df1[df1$subject==s&df1$part==i,]
df2[row,"sad.m"]<-mean(as.numeric(TN$sad), na.rm = TRUE)
df2[row,"part"]<-i
df2[row,"subject"]<-s
row=row+1
}
}
Now I want to include an additional variable 'missing' in that indicates the percentage of rows per subject and part with missing values, so that I get df3:
subject <- c(1,1,1,1,2,2,2,2)
part<-c(0,1,2,3,0,1,2,3)
sad.m<-df2$sad.m
missing <- c(0,50,50,25,50,50,50,25)
df3 <- data.frame(subject,part,sad.m,missing)
I'd really appreciate any help on how to go about this!
It's best to try and avoid loops in R where possible, since they can get messy and tend to be quite slow. For this sort of thing, dplyr library is perfect and well worth learning. It can save you a lot of time.
You can create a data frame with both variables by first grouping by subject and part, and then performing a summary of the grouped data frame:
df2 = df1 %>%
dplyr::group_by(subject, part) %>%
dplyr::summarise(
sad_mean = mean(na.omit(sad)),
na_count = (sum(is.na(sad) / n()) * 100)
)
df2
# A tibble: 8 x 4
# Groups: subject [2]
subject part sad_mean na_count
<dbl> <dbl> <dbl> <dbl>
1 1 0 4.75 0
2 1 1 2 50
3 1 2 2.5 50
4 1 3 1.67 25
5 2 0 5.5 50
6 2 1 4.5 50
7 2 2 4 50
8 2 3 4 25
For each subject and part you can calculate mean of sad and calculate ratio of NA value using is.na and mean.
library(dplyr)
df1 %>%
group_by(subject, part) %>%
summarise(sad.m = mean(sad, na.rm = TRUE),
perc_missing = mean(is.na(sad)) * 100)
# subject part sad.m perc_missing
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 4.75 0
#2 1 1 2 50
#3 1 2 2.5 50
#4 1 3 1.67 25
#5 2 0 5.5 50
#6 2 1 4.5 50
#7 2 2 4 50
#8 2 3 4 25
Same logic with data.table :
library(data.table)
setDT(df1)[, .(sad.m = mean(sad, na.rm = TRUE),
perc_missing = mean(is.na(sad)) * 100), .(subject, part)]
Try this dplyr approach to compute df3:
library(dplyr)
#Code
df3 <- df1 %>% group_by(subject,part) %>% summarise(N=100*length(which(is.na(sad)))/length(sad))
Output:
# A tibble: 8 x 3
# Groups: subject [2]
subject part N
<dbl> <dbl> <dbl>
1 1 0 0
2 1 1 50
3 1 2 50
4 1 3 25
5 2 0 50
6 2 1 50
7 2 2 50
8 2 3 25
And for full interaction with df2 you can use left_join():
#Left join
df3 <- df1 %>% group_by(subject,part) %>%
summarise(N=100*length(which(is.na(sad)))/length(sad)) %>%
left_join(df2)
Output:
# A tibble: 8 x 4
# Groups: subject [2]
subject part N sad.m
<dbl> <dbl> <dbl> <dbl>
1 1 0 0 4.75
2 1 1 50 2
3 1 2 50 2.5
4 1 3 25 1.67
5 2 0 50 5.5
6 2 1 50 4.5
7 2 2 50 4
8 2 3 25 4

Collapsing rows with dplyr

I am new to R and am trying to collapse rows based on row values with dplyr. The following example shows the sample data.
set.seed(123)
df<-data.frame(A=c(rep(1:4,4)),
B=runif(16,min=0,max=1),
C=rnorm(16, mean=1,sd=0.5))
A B c
1 1 0.36647435 0.7485365
2 2 0.51864614 0.8654337
3 3 0.04596929 0.9858012
4 4 0.15479619 1.1294208
5 1 0.76712372 1.2460700
6 2 0.17666676 0.7402996
7 3 0.89759874 1.2699954
8 4 0.90267735 0.7101804
9 1 0.91744223 0.3451281
10 2 0.25472599 0.8604743
11 3 0.10933985 0.8696796
12 4 0.71656017 1.2648846
13 1 0.21157810 1.3170205
14 2 0.14947268 1.2789700
15 3 0.92251060 1.5696901
16 4 0.30090579 1.7642853
I want to summarize/collapse two rows based on the condition that the rows in column A with values 1 and 2 as one row (as mean of row 1 and 2) . Therefore the final result will have only 12 rows because the other 4 rows has been collapsed.
I tried to use the following dplyr function but to little avail.
install.packages ("tidyverse")
library (tidyverse)
df %>% summarize_each( fun(i){ for i %in% c(1,2)funs(mean) })
The expected output is something like:
A B C
1 1.5 0.4425602 0.8069851
3 3 0.04596929 0.9858012
4 4 0.15479619 1.1294208
5 1.5 0.4718952 0.9931848
7 3 0.89759874 1.2699954
8 4 0.90267735 0.7101804
9 1.5 0.5860841 0.6028012
11 3 0.10933985 0.8696796
12 4 0.71656017 1.2648846
13 1.5 0.1805254 1.297995
15 3 0.92251060 1.5696901
16 4 0.30090579 1.7642853
Thank you in advance.
By making the implicit, order based groupings explicit, the summary can
be done with a single summarise_all call.
# Generate the data
set.seed(1)
df <- data.frame(
A = c(rep(1:4, 4)),
B = runif(16, min = 0, max = 1),
C = rnorm(16, mean = 1, sd = 0.5)
)
library(dplyr)
new <- df %>%
group_by(grp = rep(
1:4, # vector containing names of groups to create
each = 4 # number of elements in each group
)) %>%
group_by(mean_grp = cumsum(A > 2) + 1, add = T) %>%
summarise_all(mean) %>%
ungroup()
new
#> # A tibble: 12 x 5
#> grp mean_grp A B C
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 1.5 0.3188163 1.067598241
#> 2 1 2 3.0 0.5728534 1.755890584
#> 3 1 3 4.0 0.9082078 1.194921618
#> 4 2 1 1.5 0.5500358 0.291014883
#> 5 2 2 3.0 0.9446753 1.562465459
#> 6 2 3 4.0 0.6607978 0.977533195
#> 7 3 1 1.5 0.3454502 1.231911487
#> 8 3 2 3.0 0.2059746 1.410610598
#> 9 3 3 4.0 0.1765568 1.296950661
#> 10 4 1 1.5 0.5355633 1.425278418
#> 11 4 2 3.0 0.7698414 1.037282492
#> 12 4 3 4.0 0.4976992 0.005324152
I would recommend keeping the grouping variables in your data after the
summary (everything is simpler if you include them in the first place),
but if you want to, you can drop them with
new %>% select(-grp, -mean_grp).
PS. In order to avoid having "magic numbers" (such as the 1:4 and each = 4 when creating grp) included in the code, you could also create the first grouping variable as:
grp = cumsum(A < lag(A, default = A[1])) + 1
Assuming that the original data are ordered such that a new group starts each time the value of A is less than the previous value of A.
One option would be to process the rows with A equal to 1 or 2 separately from the other rows and then bind them back together:
set.seed(3)
df<-data.frame(A=c(rep(1:4,4)),B=runif(16,min=0,max=1),c=rnorm(16, mean=1,sd=0.5))
df %>%
filter(A %in% 1:2) %>%
group_by(tmp=cumsum(A==1)) %>%
summarise_all(mean) %>%
ungroup %>% select(-tmp) %>%
bind_rows(df %>% filter(!A %in% 1:2))
A B c
<dbl> <dbl> <dbl>
1 1.5 0.4877790 1.0121278
2 1.5 0.6032474 0.8840735
3 1.5 0.6042946 0.5996850
4 1.5 0.5456424 0.6198039
5 3.0 0.3849424 0.6276092
6 4.0 0.3277343 0.4343907
7 3.0 0.1246334 1.0760229
8 4.0 0.2946009 0.8461718
9 3.0 0.5120159 1.6121568
10 4.0 0.5050239 1.0999058
11 3.0 0.8679195 0.8981359
12 4.0 0.8297087 0.1667626

summarise by group of columns using min and maintaing row number

I have a data frame with 3 columns
df <- data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
I need to recover the min for each group (ID1, ID2) and the position(row.name) of this min in the original table.
Using group_by and summarise, I have obtained the min but I can't see a way to obtain the position as summarise gets rid of the columns not summarised and not used for group.
df<-data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
df[['X']] <- paste0(df$ID1,'.',df$ID2)
df <- group_by( df, X )
df <- summarise( df, Objective=min(value) )
Any ideas on how to solve this to get?
X Objective Position
1 1.1 1 1
2 1.2 2 2
3 2.1 5 5
4 2.2 6 6
Thanks in advance
If I understand correct and since you're already using dplyr, you could do it like this:
library(dplyr); library(tidyr)
unite(df, X, ID1:ID2, sep = ".") %>%
mutate(Position = row_number()) %>%
group_by(X) %>% slice(which.min(value))
#Source: local data frame [4 x 3]
#Groups: X
#
# X value Position
#1 1.1 1 1
#2 1.2 2 2
#3 2.1 5 5
#4 2.2 6 6
Or alternatively (only dplyr) - I'd rather use this one:
mutate(df, Position = row_number()) %>% group_by(ID1, ID2) %>% slice(which.min(value))
#Source: local data frame [4 x 4]
#Groups: ID1, ID2
#
# ID1 ID2 value Position
#1 1 1 1 1
#2 1 2 2 2
#3 2 1 5 5
#4 2 2 6 6
data
df <- data.frame(ID1=rep(1:2, each = 4), ID2=rep(1:2,4), value=1:8)
Here's how would I approach this using data.table (rn would be your row number).
library(data.table)
setDT(df, keep.rownames = TRUE)[, .SD[which.min(value)], list(ID1, ID2)]
# ID1 ID2 rn value
# 1: 1 1 1 1
# 2: 1 2 2 2
# 3: 2 1 5 5
# 4: 2 2 6 6
Another option is ordering and then picking the unique values
unique(setorder(df, value), by = c("ID1", "ID2"))
# ID1 ID2 rn value
# 1: 1 1 1 1
# 2: 1 2 2 2
# 3: 2 1 5 5
# 4: 2 2 6 6
Both approaches don't require creating X column
Or using base R
df <- df[order(df$value), ]
df[!duplicated(df[, 1:2]), ]
# ID1 ID2 value
# 1 1 1 1
# 2 1 2 2
# 5 2 1 5
# 6 2 2 6
data
df <- data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
Using Aggregate:
Data:
df<-data.frame(ID1=c(rep(1,4),rep(2,4)), ID2=rep(1:2,4), value=1:8)
df[['X']] <- paste0(df$ID1,'.',df$ID2)
df$rn<-row.names(df) #rn is the row number
df<-df[c("X","rn","value")]
#> df
# X rn value
#1 1.1 1 1
#2 1.2 2 2
#3 1.1 3 3
#4 1.2 4 4
#5 2.1 5 5
#6 2.2 6 6
#7 2.1 7 7
#8 2.2 8 8
Aggregate step:
df2<- aggregate(df, by=list(c(df$X)), min)
#> df2
# Group.1 X rn value
#1 1.1 1.1 1 1
#2 1.2 1.2 2 2
#3 2.1 2.1 5 5
#4 2.2 2.2 6 6

Resources