access first row of group_by dataset - r

I have a data frame df1 with columns a, b, c. I want to assign c = 0 to the first row of each group returned by group_by(a, b). I tried something like
t <- df1 %>% group_by(a,b) %>% filter(row_number(a)==1) %>% mutate(c= 0)
But it reduced number of rows. Expected output is
a b c
a1 b1 0
a1 b1 NA
a2 b2 0
a2 b2 NA

You can use seq_along to number elements in each group from 1 to the total number of elements within each group (2, in this case). Then use ifelse to set the first element of 'c' for each group to be 0 and leave the other element as is.
library(dplyr)
# Within each (a, b) group, set the first value of `c` to 0 and keep the rest.
# seq_along(c) numbers the rows of the current group starting at 1, so the
# comparison is TRUE only for the group's first row.
df %>%
group_by(a, b) %>%
mutate(c = ifelse(seq_along(c) == 1, 0, c))
# A tibble: 4 x 3
# Groups: a, b [2]
# a b c
# <fct> <fct> <dbl>
#1 a1 b1 0.
#2 a1 b1 NA
#3 a2 b2 0.
#4 a2 b2 NA
data
# Example data: two (a, b) groups with two rows each and an all-NA column `c`.
df <- data.frame(
  a = c("a1", "a1", "a2", "a2"),
  b = c("b1", "b1", "b2", "b2"),
  c = NA
)
df
# a b c
#1 a1 b1 NA
#2 a1 b1 NA
#3 a2 b2 NA
#4 a2 b2 NA

Related

Update value if two columns are same

I have the following data
a <- c("A1","A1","A2","A2")
b <- c("B1","B1","B2","B2")
val <- c(10,20,30,40)
df <- data.frame(a,b,val)
I want to replace the values in 'val' for rows that share the same 'a' and 'b', so that 'val' takes the value of the initial row
You may try
library(dplyr)
# Broadcast each (a, b) group's first `val` to every row of that group.
df %>%
  group_by(a, b) %>%
  mutate(val = val[1L])
a b val
<chr> <chr> <dbl>
1 A1 B1 10
2 A1 B1 10
3 A2 B2 30
4 A2 B2 30

Finding category values that are minimum for some variable - R

Let I have the below data frame(df):
x=c("a1","a2","a3","b1","b2","b3")
y1=c(4,2,1,1,5,8)
y2=c(7,1,9,3,2,10)
df<-data.frame(x,y1,y2)
Namely:
> df
x y1 y2
1 a1 4 7
2 a2 2 1
3 a3 1 9
4 b1 1 3
5 b2 5 2
6 b3 8 10
I want to find the value of x at which y1 and y2 each reach their minimum, within each group of x.
I want to reach the below output for df:
y1 y2
a3 a2
b1 b2
How can I reach that result? My original data is much bigger.
Thanks a lot.
You don't have a clear group column defined, so we can create one first. For the example shown, we can remove all the numbers from the x column and use the result as a group column. For each group we can then find the minimum value in each column and get the corresponding x value.
library(dplyr)
# Derive the group by stripping the first run of digits from `x`, then for
# y1 and y2 report the `x` label sitting at each group's minimum.
df %>%
  group_by(group = sub('\\d+', '', x)) %>%
  summarise(across(y1:y2, ~ x[which.min(.x)]))
# group y1 y2
# <chr> <chr> <chr>
#1 a a3 a2
#2 b b1 b2
We could use
library(stringr)
library(dplyr)
# Group on `x` with its first digit run removed; for each numeric column,
# return the `x` label found at that column's minimum within the group.
df %>%
  group_by(grp = str_remove(x, "\\d+")) %>%
  summarise(across(where(is.numeric), ~ x[which.min(.x)]))
# A tibble: 2 x 3
# grp y1 y2
# <chr> <chr> <chr>
#1 a a3 a2
#2 b b1 b2
A data.table option
> setDT(df)[, lapply(.SD, function(v) x[which.min(v)]), .(grp = gsub("\\d", "", x))]
grp y1 y2
1: a a3 a2
2: b b1 b2

Is there a way to change data frame entries in R from numeric to a specific character?

If I have a data frame like so:
df <- data.frame(
a = c(1,1,1,2,2,2,3,3,3),
b = c(1,2,3,1,2,3,1,2,3)
)
which looks like this:
> df
a b
1 1
1 2
1 3
2 1
2 2
2 3
3 1
3 2
3 3
Is there a quick way to change the columns a and b to match the example below, without explicitly having to type it all out?
> df
a b
a1 b1
a1 b2
a1 b3
a2 b1
a2 b2
a2 b3
a3 b1
a3 b2
a3 b3
In other words, I'm trying to take the name of the column and just place it in front of the value that was in that row originally.
We can use cur_column to return the corresponding column name within across and paste (str_c) the column value with the corresponding column name
library(dplyr)
library(stringr)
# Prefix every value with the name of its own column (e.g. 1 in `a` -> "a1").
# cur_column() gives the name of the column across() is currently processing.
df1 <- df %>%
mutate(across(everything(), ~ str_c(cur_column(), .)))
-output
df1
# a b
#1 a1 b1
#2 a1 b2
#3 a1 b3
#4 a2 b1
#5 a2 b2
#6 a2 b3
#7 a3 b1
#8 a3 b2
#9 a3 b3
Or using base R
# Base R: paste each column's name in front of its values, column by column.
df[] <- mapply(paste0, names(df), df, SIMPLIFY = FALSE)
Or another option is
df[] <- paste0(names(df)[col(df)], unlist(df))

In R is there a way to recode the columns from one data frame with values from another data frame?

I am still relatively new to working in R and I am not sure how to approach this problem. Any help or advice is greatly appreciated!!!
The problem I have is that I am working with two data frames and I need to recode the first data frame with values from the second. The first data frame (df1) contains the data from the respondents to a survey and the other data frame(df2) is the data dictionary for df1.
The data looks like this:
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b1","b2","b3","c1","c2","c3"))
So far I can manually recode df1 to get the expected output by doing this:
df1 <- within(df1,{
a[a==1] <- "a1"
a[a==2] <- "a2"
a[a==3] <- "a3"
b[b==4] <- "b4"
b[b==5] <- "b5"
b[b==6] <- "b6"
c[c==7] <- "c7"
c[c==8] <- "c8"
c[c==9] <- "c9"
})
However my real dataset has about 42 columns that need to be recoded and that method is a little time intensive. Is there another way in R for me to recode the values in df1 with the values in df2?
Thanks!
Just need to transform the shape a bit.
library(data.table)
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b4","b5","b6","c7","c8","c9"),stringsAsFactors = FALSE)
# Convert both data frames to data.tables in place.
setDT(df1)
setDT(df2)
# Record each row's position in an ID column before reshaping.
df1[,ID:=.I]
# Reshape to long form: one row per (ID, column name, value).
ldf1 <- melt(df1,measure.vars = c("a","b","c"),variable.name = "columnIndicator",value.name = "df1_value")
# Update join: add new_value from df2, matching on column name and value.
ldf1[df2,"new_value":=i.new_value,on=.(columnIndicator,df1_value)]
ldf1
#> ID columnIndicator df1_value new_value
#> 1: 1 a 1 a1
#> 2: 2 a 2 a2
#> 3: 3 a 3 a3
#> 4: 1 b 4 b4
#> 5: 2 b 5 b5
#> 6: 3 b 6 b6
#> 7: 1 c 7 c7
#> 8: 2 c 8 c8
#> 9: 3 c 9 c9
dcast(ldf1,ID~columnIndicator,value.var = "new_value")
#> ID a b c
#> 1: 1 a1 b4 c7
#> 2: 2 a2 b5 c8
#> 3: 3 a3 b6 c9
Created on 2020-04-18 by the reprex package (v0.3.0)
In base R, we can unlist df1, match it against df1_value, and get the corresponding new_value.
# Flatten df1 column by column, look each value up in the dictionary, and write
# the labels back into the same cells (`df1[] <-` keeps the data frame shape).
# NOTE: the lookup uses df1_value only and ignores columnIndicator, so it relies
# on every value being unique across columns, as in this example.
df1[] <- df2$new_value[match(unlist(df1), df2$df1_value)]
df1
# a b c
#1 a1 b1 c1
#2 a2 b2 c2
#3 a3 b3 c3
Is this what you are looking for???
library(dplyr)
library(tidyr)  # gather() is exported by tidyr, not dplyr
# Stack df1 into long form (column name in `key`, cell in `value`), then look
# each (column, value) pair up in the data dictionary df2.
df3 <- df1 %>% gather(key = "key", value = "value")
df3 %>% inner_join(df2, by = c("key" = "columnIndicator", "value" = "df1_value"))
Output
key value new_value
1 a 1 a1
2 a 2 a2
3 a 3 a3
4 b 4 b1
5 b 5 b2
6 b 6 b3
7 c 7 c1
8 c 8 c2
9 c 9 c3

Combining columns, while ignoring duplicates and NAs

I have a dataframe as follows and I would like to combine two columns, namely Var1 and Var2. I want the combined column (Var3) to contain no duplicates of <alpha><digit>. i.e. if Var1 == A1 and Var2 == A1, hence Var3 == A1 but not Var3 == A1-A1 or if Var1 == A4-E9 and Var2 == A4, hence Var3 == A4-E9 but not Var3 == A4-E9-A4
df <- read.table(header = TRUE, text =
"id Var1 Var2
A A1 A1
B F2 A2
C NA A3
D A4-E9 A4
E E5 A5
F NA NA
G B2-R4 A3-B2
H B3-B4 E1-G5", stringsAsFactors = FALSE)
The following is my code. I would like to improve on its readability as well as get rid of the NA that is present in row 3's entry for Var3, i.e A3-NA.
library(dplyr)
library(tidyr)
df %>%
mutate(Var3 = paste(Var1, Var2, sep = "-")) %>%
separate_rows(Var3, sep = "-") %>%
group_by(id, Var3) %>%
slice(1) %>%
group_by(id) %>%
mutate(Var3 = paste(unlist(Var3[!is.na(Var3)]), collapse = "-")) %>%
slice(1) %>%
ungroup
Here's my desired output:
# A tibble: 8 x 4
id Var1 Var2 Var3
<chr> <chr> <chr> <chr>
1 A A1 A1 A1
2 B F2 A2 A2-F2
3 C <NA> A3 A3
4 D A4-E9 A4 A4-E9
5 E E5 A5 A5-E5
6 F <NA> <NA> <NA>
7 G B2-R4 A3-B2 A3-B2-R4
8 H B3-B4 E1-G5 B3-B4-E1-G5
if 'df1' is the output, then we remove the 'NA' that follows a - with sub
# Drop the first "-NA" from each Var3 string; strings without it are unchanged.
df1 %>%
mutate(Var3 = sub("-NA", "", Var3))
# A tibble: 8 x 4
# id Var1 Var2 Var3
# <chr> <chr> <chr> <chr>
#1 A A1 A1 A1
#2 B F2 A2 A2-F2
#3 C <NA> A3 A3
#4 D A4-E9 A4 A4-E9
#5 E E5 A5 A5-E5
#6 F <NA> <NA> NA
#7 G B2-R4 A3-B2 A3-B2-R4
#8 H B3-B4 E1-G5 B3-B4-E1-G5
We can also do this slightly differently with tidyverse by gather into 'long' format, then split the 'value' column using separate_rows, grouped by 'id', summarise the 'Var3' column by pasteing the sorted unique elements of 'Var3' and left_join with the original dataset 'df'
library(tidyverse)
# Steps: stack Var1/Var2 into long form, split `value` into one token per row,
# paste the sorted unique tokens of each id with "-", turn an empty result
# back into NA (sort() drops NA tokens, so an id with only NA pastes to ""),
# and finally join Var3 onto the original columns.
gather(df, key, value, -id) %>%
separate_rows(value) %>%
group_by(id) %>%
summarise(Var3 = paste(sort(unique(value)), collapse='-')) %>%
mutate(Var3 = replace(Var3, Var3=='', NA)) %>%
left_join(df, .)
# id Var1 Var2 Var3
#1 A A1 A1 A1
#2 B F2 A2 A2-F2
#3 C <NA> A3 A3
#4 D A4-E9 A4 A4-E9
#5 E E5 A5 A5-E5
#6 F <NA> <NA> <NA>
#7 G B2-R4 A3-B2 A3-B2-R4
#8 H B3-B4 E1-G5 B3-B4-E1-G5
NOTE: The %>% makes even simple code span multiple lines, but if required, we can put all those statements on a single line and call it a one-liner
Here is a one-liner
library(data.table)
setDT(df)[, Var3 := paste(sort(unique(unlist(strsplit(unlist(.SD),"-")))), collapse="-"), id]
You could do it in one line
# Paste Var1 and Var2, split on "-" again, and keep each token once.
# paste() turns a missing value into the literal string "NA", which is why
# the filter compares against "NA" instead of using is.na().
# unlist() flattens lapply()'s result so Var3 is a plain character column
# rather than a list column.
df$Var3 <- unlist(lapply(
  strsplit(paste(df$Var1, df$Var2, sep = "-"), "-"),
  function(x) {
    tokens <- unique(x)
    paste(tokens[tokens != "NA"], collapse = "-")
  }
))
Output:
id Var1 Var2 Var3
1 A A1 A1 A1
2 B F2 A2 F2-A2
3 C <NA> A3 A3
4 D A4-E9 A4 A4-E9
5 E E5 A5 E5-A5
6 F <NA> <NA>
7 G B2-R4 A3-B2 B2-R4-A3
8 H B3-B4 E1-G5 B3-B4-E1-G5
The first part in the lapply function is similar to your first call with dplyr. First the columns are concatenated, and then we split them again.
The function within lapply removes all NA's, and then collapses the string again.
Hope this helps!
EDIT: Speed comparison for fun!
262,144 rows
Average runtimes:
Florian: 3.97 seconds
Sotos: 2.46 seconds
Akrun: 1.34 seconds
Adamm: >120 seconds
# Rebuild the 8-row example, then double it 15 times: 8 * 2^15 = 262,144 rows.
df <- read.table(
  text = "id Var1 Var2
A A1 A1
B F2 A2
C NA A3
D A4-E9 A4
E E5 A5
F NA NA
G B2-R4 A3-B2
H B3-B4 E1-G5",
  header = TRUE,
  stringsAsFactors = FALSE
)
for (i in seq_len(15)) {
  df <- rbind(df, df)
}
# Time each proposed solution on the enlarged df, 5 runs apiece.
library(microbenchmark)
# Florian's method
microbenchmark(
lapply(strsplit(paste(df$Var1, df$Var2, sep = "-"),"-"),
function(x)paste(unique(x)[unique(x)!="NA"],collapse="-")),times=5)
# Sotos'method
microbenchmark(
gsub('NA-|-NA', '', vapply(strsplit(do.call(paste, df[-1]), " |-"), function(i) paste(unique(i), collapse = "-"), character(1L))), times=5)
# akrun method
# NOTE(review): setDT() converts df to a data.table in place and `:=` adds Var3
# by reference, so the Adamm benchmark below runs on that modified object
# rather than on the original data frame - confirm this is intended.
library(data.table)
microbenchmark(
setDT(df)[, Var3 := paste(sort(unique(unlist(strsplit(unlist(.SD),"-")))), collapse="-"), id], times=5)
# Adamm method
microbenchmark(
sapply(1:nrow(df), function(i) ifelse(df[i,2]!=df[i,3] & !is.na(df[i,2]) & !is.na(df[i,3]), paste(df[i,2], df[i,3], sep="-"), ifelse(!is.na(df[i,3]), df[i,3], df[i,2]))), times=5)
If you want complex solution; long one-liner, nested ifelse().
# Per row: paste columns 2 and 3 (Var1, Var2) when both are present and
# different; otherwise use Var2, or Var1 when Var2 is missing.
# seq_len() yields no iterations for a zero-row df (1:nrow(df) would give
# c(1, 0)), and vapply() guarantees a character vector result.
df$Var3 <- vapply(seq_len(nrow(df)), function(i) ifelse(df[i,2]!=df[i,3] & !is.na(df[i,2]) & !is.na(df[i,3]), paste(df[i,2], df[i,3], sep="-"), ifelse(!is.na(df[i,3]), df[i,3], df[i,2])), character(1))
> df
id Var1 Var2 Var3
1 A A1 A1 A1
2 B F2 A2 F2-A2
3 C <NA> A3 A3
4 D A4-E9 A4 A4-E9-A4
5 E E5 A5 E5-A5
6 F <NA> <NA> <NA>
7 G B2-R4 A3-B2 B2-R4-A3-B2
8 H B3-B4 E1-G5 B3-B4-E1-G5
In case of efficiency I made a small experiment and I measured time of each proposed solution, here are the results:
First of all I need more rows:
n <- 10000
df <- do.call("rbind", replicate(n, df, simplify = FALSE))
Akrun solution 1 with tidyverse
Time difference of 1.452809 secs
Akrun solution 2 with data.table
Time difference of 0.4530261 secs
Florian Maas solution with lapply
Time difference of 1.812106 secs
My solution with sapply
Time difference of 2.289345 mins
Sotos solution
Time difference of 1.515296 secs

Resources