Replace column based on column names - r

I have 65 columns, but a sample of data could be as follows:
# Example data from the question. NOTE: with the default check.names = TRUE,
# read.table() makes the header names unique, so the second D column is read
# in as D.1.
df<-read.table (text=" Name D A D E
Rose D D C B
Smith B A D D
Lora A A D D
Javid A D D B
Ahmed C A E A
Helen B A D D
Nadia A A D A
", header=TRUE)
I want to get the following table:
Name D A D E
Rose 2 1 1 1
Smith 1 2 2 1
Lora 1 2 2 1
Javid 1 1 2 1
Ahmed 1 2 1 1
Helen 1 2 2 1
Nadia 1 2 2 1
The numbers follow the first row (the header). For example, the second column is named D, so every D in it should become 2 and everything else should become 1. Likewise, the third column is named A, so every A in it should become 2 and everything else 1, and so on. Please bear in mind that I have 65 columns. I understand that column names should normally be unique, but in this case I cannot change them.

With ifelse and sapply:
# Recode every column except the first: 2 where the cell equals the name of
# its own column, 1 otherwise. Columns are addressed by position, so
# duplicated column names are handled correctly.
# seq_along(df)[-1] is empty for a one-column data frame, whereas 2:ncol(df)
# would count backwards (2, 1); lapply() always returns one vector per column.
value_cols <- seq_along(df)[-1]
df[value_cols] <- lapply(
  value_cols,
  function(i) ifelse(df[[i]] == names(df)[i], 2, 1)
)
output
#> df
Name D A D E
1 Rose 2 1 1 1
2 Smith 1 2 2 1
3 Lora 1 2 2 1
4 Javid 1 1 2 1
5 Ahmed 1 2 1 1
6 Helen 1 2 2 1
7 Nadia 1 2 2 1
data
# Example data built with structure(), so that the duplicated column name D
# is kept exactly as in the question.
df <- structure(list(Name = c("Rose", "Smith", "Lora", "Javid", "Ahmed",
"Helen", "Nadia"), D = c("D", "B", "A", "A", "C", "B", "A"),
A = c("D", "A", "A", "D", "A", "A", "A"), D = c("C", "D",
"D", "D", "E", "D", "D"), E = c("B", "D", "D", "B", "A",
"D", "A")), class = "data.frame", row.names = c(NA, -7L))

# For every column after Name: 2L where the value equals the column's own
# name, 1L otherwise (adding 1L to the logical comparison does the recoding).
# Columns are looked up by name, so the names must be unique.
cols <- names(df)[-1]
df[cols] <- lapply(cols, function(col_name) {
  matches_name <- df[[col_name]] == col_name
  matches_name + 1L
})
# Name D A
# 1 Rose 2 1
# 2 Smith 1 2
# 3 Lora 1 2
# 4 Javid 1 1
# 5 Ahmed 1 2
# 6 Helen 1 2
# 7 Nadia 1 2
Simplified data (without repeated column names)
# Simplified example: only the Name, D and A columns, so every column name
# is unique.
df <- data.frame(
Name = c("Rose", "Smith", "Lora", "Javid", "Ahmed", "Helen", "Nadia"),
D = c("D", "B", "A", "A", "C", "B", "A"),
A = c("D", "A", "A", "D", "A", "A", "A")
)

Another approach, you can stack, replace and unstack, i.e
# Reshape the value columns to long form (values + ind, the source column
# name), recode each value against its own column name, then reshape back to
# wide form and re-attach Name.
stack_df <- transform(
  stack(df[-1]),
  values = ifelse(values == ind, 2, 1)
)
cbind.data.frame(Name = df$Name, unstack(stack_df))
# Name D A E
#1 Rose 2 1 1
#2 Smith 1 2 1
#3 Lora 1 2 1
#4 Javid 1 1 1
#5 Ahmed 1 2 1
#6 Helen 1 2 1
#7 Nadia 1 2 1
DATA
# Example data used for the stack()/unstack() approach. It has a single D
# column (unique names) and is not assigned to a name here.
structure(list(Name = c("Rose", "Smith", "Lora", "Javid", "Ahmed",
"Helen", "Nadia"), D = c("D", "B", "A", "A", "C", "B", "A"),
A = c("D", "A", "A", "D", "A", "A", "A"), E = c("B", "D",
"D", "B", "A", "D", "A")), row.names = c(NA, -7L), class = "data.frame")

dplyr option with ifelse like this:
library(dplyr)
df %>%
  # read.table() renamed the second D column to D.1, so drop such a numeric
  # suffix from the column name before comparing the values against it.
  # (This assumes no genuine column name ends in ".<digits>".)
  mutate(across(D:E, ~ifelse(. == sub("\\.[0-9]+$", "", cur_column()), 2, 1)))
#> Name D A D.1 E
#> 1 Rose 2 1 1 1
#> 2 Smith 1 2 2 1
#> 3 Lora 1 2 2 1
#> 4 Javid 1 1 2 1
#> 5 Ahmed 1 2 1 1
#> 6 Helen 1 2 2 1
#> 7 Nadia 1 2 2 1
Created on 2022-09-19 with reprex v2.0.2

Using dplyr:
The data:
# Same cell values as the question's data, but with unique column names A-D.
df <- read.table (
text = " Name A B C D
Rose D D C B
Smith B A D D
Lora A A D D
Javid A D D B
Ahmed C A E A
Helen B A D D
Nadia A A D A
",
header = TRUE
)
> df
Name A B C D
1 Rose D D C B
2 Smith B A D D
3 Lora A A D D
4 Javid A D D B
5 Ahmed C A E A
6 Helen B A D D
7 Nadia A A D A
Note that i changed the column names
# Recode every column except Name: 2 where the value equals the name of the
# column currently being processed, 1 otherwise.
df %>%
  mutate(
    across(
      -Name,
      function(values) ifelse(values == cur_column(), 2, 1)
    )
  )
Name A B C D
1 Rose 1 1 2 1
2 Smith 1 1 1 2
3 Lora 2 1 1 2
4 Javid 2 1 1 1
5 Ahmed 1 1 1 1
6 Helen 1 1 1 2
7 Nadia 2 1 1 1
The mutate-command modifies columns in dataframes. Using the across()-function we specify that the mutation should be applied to more than one column. inside the across-call, we select every column but the name column using !c(Name). We then specify a function that compares the name of the column cur_column() with the values in the column .x. If they are the same, set the value to 2, else to 1.
EDIT: used ifelse instead of case_when as there is only one condition to check

You can compare each row with the column names. Adding 1 to the logical values converts FALSE and TRUE into 1 and 2 respectively.
# Compare each value column with its own column name (sweeping the names
# across the columns of the value matrix); TRUE/FALSE + 1 gives 2/1.
cell_matches <- sweep(as.matrix(df[-1]), 2, names(df)[-1], FUN = "==")
df[-1] <- cell_matches + 1
df
# Name D A D E
# 1 Rose 2 1 1 1
# 2 Smith 1 2 2 1
# 3 Lora 1 2 2 1
# 4 Javid 1 1 2 1
# 5 Ahmed 1 2 1 1
# 6 Helen 1 2 2 1
# 7 Nadia 1 2 2 1

Related

Calculation of the cumulative points before the event/game

I would like to cumulate the points of several football clubs for each one for a match day.
I have created a sample dataset to explain the problem:
# Example fixtures: one row per match with the two clubs (Heim, Auswärts), a
# result code (Ergebnis), the match day (Round) and the season (Saison).
# NOTE(review): the name `t` masks base::t() as a variable. Calls such as t(x)
# still resolve to the function, but a more descriptive name would be clearer.
t <- data.frame(Heim = c("A", "B", "B", "D", "C", "A", "C", "D", "A", "B", "B", "D", "C", "A", "C", "D"),
Auswärts = c("C", "D", "A", "C", "B", "D", "A", "B", "C", "D", "A", "C", "B", "D", "A", "B"),
Ergebnis= c("S", "U", "N", "N", "S", "S", "N", "U", "N", "S", "N", "U", "S", "S", "U", "U"),
Round = c(1,1,2,2,3,3,4,4,1,1,2,2,3,3,4,4),
Saison = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2))
My idea was to create, for each club (the original data set has more than 4), a separate column with the points scored and a cumulative column next to it.
So something like this:
# Points club A earns in each match; NA when A is not playing.
# Ergebnis is scored from the home side: "S" gives the home club 3 points,
# "N" gives the away club 3 points, "U" gives both clubs 1 point.
t$A_Points <- ifelse(t$Heim == "A" & t$Ergebnis == "S", 3, 0)
t$A_Points <- ifelse(t$Heim == "A" & t$Ergebnis == "U", 1, t$A_Points)
t$A_Points <- ifelse(t$Auswärts == "A" & t$Ergebnis == "U", 1, t$A_Points)
t$A_Points <- ifelse(t$Auswärts == "A" & t$Ergebnis == "N", 3, t$A_Points)
t$A_Points <- ifelse(t$Auswärts != "A" & t$Heim != "A", NA, t$A_Points)
# Flag the matches A played in (1) versus all others (0). Missing values must
# be detected with is.na(): comparing against the string "NA" returns NA for
# every missing entry instead of TRUE.
t$A <- ifelse(is.na(t$A_Points), 0, 1)
# Running total of A's points per season, computed separately for the rows
# where A played; the grouping is dropped again afterwards.
t <- t %>%
  arrange(Saison, Round, A) %>%
  group_by(Saison, A) %>%
  mutate(cumsum = cumsum(A_Points)) %>%
  ungroup()
Unfortunately, it is very time and space-consuming even for 4 clubs...
Also, I would like to have the sum of the points of the matches without the result of the current matchday.
The optimal result for me would be following:
Heim Auswärts Ergebnis Round Saison Points_Heim Points_Auswärts
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A C S 1 1 0 0
2 B D U 1 1 0 0
3 B A N 2 1 1 3
4 D C N 2 1 1 0
5 A D S 3 1 6 1
6 C B S 3 1 3 1
7 C A N 4 1 6 9
8 D B U 4 1 1 1
9 A C N 1 2 0 0
10 B D S 1 2 0 0
11 B A N 2 2 3 0
12 D C U 2 2 0 3
13 A D S 3 2 3 1
14 C B S 3 2 4 3
15 C A U 4 2 7 3
16 D B U 4 2 1 3
I would be very happy about an idea for an easier solution.
Probably not the shortest solution. But I would do
# Step 1: per match, the points each side earns from the result code
# (S: home 3 / away 0, U: 1 each, N: home 0 / away 3), plus the two output
# columns initialised to 0. The data stays grouped by Saison, so the running
# totals computed below restart in every season.
t <- t %>%
group_by(Saison) %>%
mutate(Heim_Points_Veränderung = case_when(Ergebnis == "S" ~ 3,
Ergebnis == "U" ~ 1,
Ergebnis == "N" ~ 0),
Auswärts_Points_Veränderung = case_when(Ergebnis == "S" ~ 0,
Ergebnis == "U" ~ 1,
Ergebnis == "N" ~ 3),
Points_Heim = 0,
Points_Auswärts = 0)
# Step 2: one pass per club i. The columns <i>_points and <i>_cumsum are
# created through !!sym(...) := because their names are built at run time.
# NOTE(review): union() already returns distinct values, so the outer unique()
# looks redundant.
# NOTE(review): lag()/cumsum() follow the current row order within each Saison;
# presumably the rows are already sorted by Round -- verify.
for (i in unique(union(t$Heim, t$Auswärts))){
t <- t %>%
# <i>_points: points club i earns in this row (0 when it does not play)
mutate(!!sym(paste0(i,"_points")) := if_else(Heim == i, Heim_Points_Veränderung, 0),
!!sym(paste0(i,"_points")) := if_else(Auswärts == i, Auswärts_Points_Veränderung, !!sym(paste0(i,"_points"))),
# <i>_cumsum: running total shifted down one row by lag(), i.e. the points
# collected before the current row
!!sym(paste0(i,"_cumsum")) := cumsum(lag(!!sym(paste0(i,"_points")), default=0)),
# copy that total into the output column of the side club i plays on
Points_Heim = if_else(Heim == i, !!sym(paste0(i,"_cumsum")), Points_Heim),
Points_Auswärts = if_else(Auswärts == i, !!sym(paste0(i,"_cumsum")), Points_Auswärts))
}
# Step 3: keep the original columns plus the two results, dropping all
# per-club helper columns.
t <- t %>%
select(Heim, Auswärts, Ergebnis, Round, Saison, Points_Heim, Points_Auswärts)
Output
> t
# A tibble: 16 x 7
# Groups: Saison [2]
Heim Auswärts Ergebnis Round Saison Points_Heim Points_Auswärts
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A C S 1 1 0 0
2 B D U 1 1 0 0
3 B A N 2 1 1 3
4 D C N 2 1 1 0
5 C B S 3 1 3 1
6 A D S 3 1 6 1
7 C A N 4 1 6 9
8 D B U 4 1 1 1
9 A C N 1 2 0 0
10 B D S 1 2 0 0
11 B A N 2 2 3 0
12 D C U 2 2 0 3
13 C B S 3 2 4 3
14 A D S 3 2 3 1
15 C A U 4 2 7 6
16 D B U 4 2 1 3
This solution should adapt to any number of clubs. Briefly, I store the possible change in points in Heim/Auswärts_Points_Veränderung using case_when (easier than a lot of ifelse) so I can create a column of points change for each club (running a for loop on all the clubs). This allows me to do a cumsum like you using lag to make sure the cumsum is updated 1 row later (to display the cumulative points before the match and not after), which I enter in the Points_Heim/Auswärts column only when the club is displayed in the Heim/Auswärts columns. The key to my solution is to use !!sym to feed dynamic variable names to mutate (note the assignment with :=) inside the for loop.

Splitting single column into four columns and count repeated pattern in R

The aim of this project is to understand how information is acquired while looking at an object. Imagine an object has elements such as a, b, c, d, e and f. A person might look at a, then move on to b, and so forth. We wish to plot and understand how that person has navigated across the different elements of a given stimulus. I have data that captured this movement in a single column, but I need to split it into several columns to get the navigation pattern. Please find an example below.
I have column extracted from a data frame. Now it has to be split into four columns based on its characteristics.
# Sequence of elements in the order they were looked at, stored as the single
# column `a` of a data frame.
a <- c( "a", "b", "b", "b", "a", "c", "a", "b", "d", "d", "d", "e", "f", "f", "e", "e", "f")
a <- as.data.frame(a)
Expected output
from to countfrom countto
a b 1 3
b a 3 1
a c 1 1
c a 1 1
a b 1 1
b d 1 3
d e 3 1
e f 1 2
f e 2 2
e f 2 1
Note: I used dplyr to extract from the dataframe.
Use rle to get the relative runs of each letter, and then piece it together:
# Run-length encode the sequence: r$values holds each run's element and
# r$lengths how many times it repeats.
r <- rle(a$a)
## or maybe `r <- rle(as.character(a$a))` depending on your R version
# Every run except the last is a "from"; every run except the first is the
# matching "to".
data.frame(
  countfrom = head(r$lengths, -1),
  from = head(r$values, -1),
  countto = tail(r$lengths, -1),
  to = tail(r$values, -1)
)
## countfrom from countto to
##1 1 a 3 b
##2 3 b 1 a
##3 1 a 1 c
##4 1 c 1 a
##5 1 a 1 b
##6 1 b 3 d
##7 3 d 1 e
##8 1 e 2 f
##9 2 f 2 e
##10 2 e 1 f
Or in the tidyverse
library(tidyverse)
a <- c("a", "b", "b", "b", "a", "c", "a", "b", "d",
       "d", "d", "e", "f", "f", "e", "e", "f")
foo <- rle(a)
# Pair every run with the run that follows it (lead()).
answ <- tibble(
  from = foo$values,
  to = lead(from),
  fromCount = foo$lengths,
  toCount = lead(fromCount)
)
# The last run has no successor, so its row (where `to` is NA) is dropped.
answ <- filter(answ, !is.na(to))
# A tibble: 10 x 4
from to fromCount toCount
<chr> <chr> <int> <int>
1 a b 1 3
2 b a 3 1
3 a c 1 1
4 c a 1 1
5 a b 1 1
6 b d 1 3
7 d e 3 1
8 e f 1 2
9 f e 2 2
10 e f 2 1

Enumerate a grouping variable in a tibble

I would like to know how to use row_number(), or anything else, to turn a grouping variable into an integer index.
tibble_test <- tibble(
  A = letters[1:10],
  group = rep(c("A", "B", "C", "D"), times = c(3, 2, 4, 1))
)
# Number the rows within each level of `group`, restarting at 1 per group.
mutate(group_by(tibble_test, group), G1 = row_number())
But I would like to have this output:
# A tibble: 10 x 4
A group G1 G2
<chr> <chr> <dbl> <dbl>
1 a A 1 1
2 b A 2 1
3 c A 3 1
4 d B 1 2
5 e B 2 2
6 f C 1 3
7 g C 2 3
8 h C 3 3
9 i C 4 3
10 j D 1 4
My question is: how to get this column G2, I know i could transform the 'group' var into a factor then integer (after the tibble is arranged) but I would like to know if it can be done using a counting.
You just need one more step and include the group indices with group_indices(). Be aware that how your data is arranged/sorted will affect the index.
library(dplyr)
tibble_test <- tibble(A = letters[1:10], group = c("A", "A", "A", "B", "B", "C", "C", "C", "C", "D"))
# G1: position of the row within its group; G2: index of the group itself.
# cur_group_id() replaces the zero-argument group_indices(), which has been
# deprecated since dplyr 1.0.0.
tibble_test %>%
  group_by(group) %>%
  mutate(G1 = row_number(),
         G2 = cur_group_id())
# A tibble: 10 x 4
# Groups: group [4]
A group G1 G2
<chr> <chr> <int> <int>
1 a A 1 1
2 b A 2 1
3 c A 3 1
4 d B 1 2
5 e B 2 2
6 f C 1 3
7 g C 2 3
8 h C 3 3
9 i C 4 3
10 j D 1 4

How to add a boolean value to a column when 2 different dataframes match on 2 columns in R?

I have 2 different dataframes. I want to add a column to my second dataframe and assign it a value of 0 or 1. Where df1$code == df2$code & df1$date == df2$date, I want a 1 for those rows and a 0 for all others, as in the expected result below. A visual and reproducible example may make this easier to understand.
# df1: reference (code, date) pairs; df2: rows to be flagged; df3: the desired
# result, i.e. df2 plus the 0/1 column `value`.
df1 <- data.frame(code = c("A", "B", "C", "D"), date = c(1,2,3,4))
df2 <- data.frame(code = c("A", "B", "E", "R", "V", "F"), date = c(1,2,3,4,5,6))
df3 <- data.frame(code = c("A", "B", "E", "R", "V", "F"), date = c(1,2,3,4,5,6), value =c(1,1,0,0,0,0))
DF1
code date
1 A 1
2 B 2
3 C 3
4 D 4
DF2
code date
1 A 1
2 B 2
3 E 3
4 R 4
5 V 5
6 F 6
The resulting DF I want
code date value
1 A 1 1
2 B 2 1
3 E 3 0
4 R 4 0
5 V 5 0
6 F 6 0
We can use %in% to create a logical vector and then coerce it to binary with as.integer or +
# Flag the rows of df2 whose (code, date) pair also occurs in df1. Matching on
# the pair rather than on code alone keeps a row at 0 when its code exists in
# df1 but with a different date. The unit-separator character "\037" keeps the
# pasted keys unambiguous.
df1_keys <- paste(df1$code, df1$date, sep = "\037")
df2_keys <- paste(df2$code, df2$date, sep = "\037")
# Unary + turns the logical result into 0/1.
df2$value <- +(df2_keys %in% df1_keys)
df2
# code date value
#1 A 1 1
#2 B 2 1
#3 E 3 0
#4 R 4 0
#5 V 5 0
#6 F 6 0
I would do it like this:
# Tag every df1 row with value = 1, join that tag onto df2 using the columns
# the two data frames share, then turn the NA left on unmatched rows into 0.
matched <- left_join(df2, mutate(df1, value = 1))
mutate(matched, value = coalesce(value, 0))
# Joining, by = c("code", "date")
# code date value
# 1 A 1 1
# 2 B 2 1
# 3 E 3 0
# 4 R 4 0
# 5 V 5 0
# 6 F 6 0

How to add a row to data frame based on a condition

I've a dataframe which I want to add a row on the basis of the following conditions. The conditions are when column a is equal to C and column b is equal to 3 or 5.
Here is my dataframe
# Example data. With stringsAsFactors = TRUE, column a is stored as a factor.
df <- data.frame(a = c("A", "B", "C", "D", "C", "A", "C", "E"),
b = c(seq(8)), stringsAsFactors = TRUE)
Whenever the condition is TRUE, I want to insert a new row directly below the row where it is met, with a = "add" and b = 3. I have tried the following:
# rbind() can only append the new row at the end of the data frame.
# NOTE: b = "3" is a string, so the combined b column is coerced to character.
rbind(df, data.frame(a="add", b = "3"))
# a b
# 1 A 1
# 2 B 2
# 3 C 3
# 4 D 4
# 5 C 5
# 6 A 6
# 7 C 7
# 8 E 8
# 9 add 3
This is not the output I want. The output I want is
# a b
# 1 A 1
# 2 B 2
# 3 C 3
# 4 add 3
# 5 D 4
# 6 C 5
# 7 add 3
# 8 A 6
# 9 C 7
# 10 E 8
How can I do that? I am new to R and thank you for your help.
# Duplicate every row that meets the condition, then overwrite the second copy
# of each duplicated pair with the inserted ("add", 3) row.
is_hit <- df$a == "C" & df$b %in% c(3, 5)
lens <- ifelse(is_hit, 2, 1)
# seq_len() rather than 1:NROW(df): the latter yields c(1, 0) for zero rows.
ind <- rep(seq_len(NROW(df)), lens)
df2 <- df[ind, ]
# a may be a factor; store it as character so "add" is a valid value.
df2$a <- as.character(df2$a)
# cumsum(lens) is the position in df2 at which each original row ends; for a
# duplicated row that is its second copy.
add_at <- cumsum(lens)[is_hit]
df2$a[add_at] <- "add"
df2$b[add_at] <- 3
df2
# a b
#1 A 1
#2 B 2
#3 C 3
#3.1 add 3
#4 D 4
#5 C 5
#5.1 add 3
#6 A 6
#7 C 7
#8 E 8
A solution using the tidyverse package.
library(tidyverse)
# Split df after every row that meets the condition, append an ("add", 3) row
# to each piece and bind the pieces back together.
df2 <- df %>%
# cumsum() counts the matches seen so far; lag() shifts that count down one
# row, so a matching row stays in the same piece as the rows before it and the
# next piece starts on the following row
mutate(Group = lag(cumsum(a == "C" & b %in% c(3, 5)), default = FALSE)) %>%
group_split(Group) %>%
map_dfr(~ .x %>% bind_rows(tibble(a = "add", b = 3))) %>%
# the final piece also received an "add" row; drop that last row.
# NOTE(review): if the last row of df itself meets the condition, the row
# removed here is the "add" row that belongs after it -- verify for such data
slice(-n()) %>%
select(-Group)
df2
# # A tibble: 10 x 2
# a b
# <chr> <dbl>
# 1 A 1
# 2 B 2
# 3 C 3
# 4 add 3
# 5 D 4
# 6 C 5
# 7 add 3
# 8 A 6
# 9 C 7
# 10 E 8
In base R, we can find the positions where a is "C" and b is 3 or 5, repeat those rows in the data frame, and then overwrite the repeated copies with the required values.
# Store a as character first so that "add" can be assigned even when a is a
# factor (otherwise the assignment below produces NA with a warning).
df$a <- as.character(df$a)
pos <- which(df$a == "C" & df$b %in% c(3, 5))
# Repeat every matching row once, directly after itself. seq_len() keeps this
# correct for a zero-row data frame, where seq(nrow(df)) would give c(1, 0).
df <- df[sort(c(seq_len(nrow(df)), pos)), ]
# After duplication, the copy of the k-th match sits at position pos[k] + k.
df[pos + seq_along(pos), ] <- list("add", 3)
row.names(df) <- NULL
df
# a b
#1 A 1
#2 B 2
#3 C 3
#4 add 3
#5 D 4
#6 C 5
#7 add 3
#8 A 6
#9 C 7
#10 E 8
data
# Same example data as in the question, but with column a kept as character
# (stringsAsFactors = FALSE).
df <- data.frame(a = c("A", "B", "C", "D", "C", "A", "C", "E"),
b = c(seq(8)), stringsAsFactors = FALSE)

Resources