Generate unique dyad identifiers for unordered pairs - r

The dataframe I am working on is coded in dyadic format where each observation (i.e., row) contains a source node (from) and a target node (to) along with some other dyadic covariates (such as dyadic correlation, corr).
For simplicity's sake, I want to treat each dyad as unordered and generate a unique identifier for each dyad, like the one (i.e., df1) below:
# original data
df <- data.frame(
from = c("A", "A", "A", "B", "C", "A", "D", "E", "F", "B"),
to = c("B", "C", "D", "C", "B", "B", "A", "A", "A", "A"),
corr = c(0.5, 0.7, 0.2, 0.15, 0.15, 0.5, 0.2, 0.45, 0.54, 0.5))
from to corr
1 A B 0.50
2 A C 0.70
3 A D 0.20
4 B C 0.15
5 C B 0.15
6 A B 0.50
7 D A 0.20
8 E A 0.45
9 F A 0.54
10 B A 0.50
# desired format
df1 <- data.frame(
from = c("A", "A", "A", "B", "C", "A", "D", "E", "F", "B"),
to = c("B", "C", "D", "C", "B", "B", "A", "A", "A", "A"),
corr = c(0.5, 0.7, 0.2, 0.15, 0.15, 0.5, 0.2, 0.45, 0.54, 0.5),
dyad = c(1, 2, 3, 4, 4, 1, 3, 5, 6, 1))
from to corr dyad
1 A B 0.50 1
2 A C 0.70 2
3 A D 0.20 3
4 B C 0.15 4
5 C B 0.15 4
6 A B 0.50 1
7 D A 0.20 3
8 E A 0.45 5
9 F A 0.54 6
10 B A 0.50 1
where dyad A-B/B-A, A-D/D-A are treated as identical pairs and are assigned with the same dyad identifiers.
While it's easy to extract a list of un-ordered pairs from the original data, it's hard to map them onto the original dataframe to generate un-ordered dyad identifiers. Could anyone offer some insights on this?

One dplyr option could be:
# Build an order-independent key for each pair (larger label first), then
# number the groups; ids follow the sorted order of the key.
# The "_" separator keeps multi-character labels apart: without it the pairs
# ("B", "AA") and ("BA", "A") would both collapse to "BAA". (Labels are
# assumed not to contain "_" themselves.)
# group_by() + cur_group_id() replaces group_indices(., ...), which is
# deprecated since dplyr 1.0.0.
df %>%
  group_by(dyad_key = paste(pmax(from, to), pmin(from, to), sep = "_")) %>%
  mutate(dyad = cur_group_id()) %>%
  ungroup() %>%
  select(-dyad_key) %>%
  as.data.frame()
from to corr dyad
1 A B 0.50 1
2 A C 0.70 2
3 A D 0.20 4
4 B C 0.15 3
5 C B 0.15 3
6 A B 0.50 1
7 D A 0.20 4
8 E A 0.45 5
9 F A 0.54 6
10 B A 0.50 1
Or:
# dense_rank() numbers the distinct keys in sorted order.
# The "_" separator keeps multi-character labels apart: without it the pairs
# ("B", "AA") and ("BA", "A") would both collapse to "BAA". (Labels are
# assumed not to contain "_" themselves.)
df %>%
  mutate(dyad = dense_rank(paste(pmax(from, to), pmin(from, to), sep = "_")))
However, if you need to assign the identifiers in a specific order (meaning that the identifiers hold some information on their own), then the solution from @Ronak Shah could be better for you.

One way using apply could be to sort and paste the values in the two columns, convert them to a factor and then to an integer to get a unique number for each combination.
# Sort the two node labels within each row so that A-B and B-A give the same
# key, e.g. "A_B".
df$temp <- apply(df[1:2], 1, function(x) paste(sort(x), collapse = "_"))
# levels = unique(df$temp) numbers the keys in order of first appearance
# rather than alphabetically.
df$dyad <- as.integer(factor(df$temp, levels = unique(df$temp)))
# Drop the helper column again.
df$temp <- NULL
df
# from to corr dyad
#1 A B 0.50 1
#2 A C 0.70 2
#3 A D 0.20 3
#4 B C 0.15 4
#5 C B 0.15 4
#6 A B 0.50 1
#7 D A 0.20 3
#8 E A 0.45 5
#9 F A 0.54 6
#10 B A 0.50 1

Related

Reshaping columns to rows and have a percentage

I have a data like this:
structure(list(A = c("a", "b", "c", "c", "c", "b", "a", "b"),
B = c("b", "b", "c", "a", "b", "c", "c", "a"), C = c("c",
"c", "c", "a", "a", "a", "b", "b"), D = c("a", "b", "c",
"c", "c", "a", "b", "b"), group = c("x", "y", "x", "x", "x",
"y", "y", "y")), class = "data.frame", row.names = c(NA,
-8L))
I want to reshape it so that the column names become rows (facet), and also compute a percentage (the per column) for each stack value within each group and facet.
The desired data looks like this (the per column is not accurate):
facet group stack per
1 A x a 2.1
2 A y b 4.2
3 A x c 10.2
4 A y a 20.2
5 A x b 5.6
6 A y c 11.7
7 B x a 5.4
8 B y b 17.7
9 B x c 9.0
10 B y a 14.7
11 B x b 3.2
12 B y c 13.5
13 C x a 8.8
14 C y b 11.5
15 C x c 0.7
16 C y a 7.3
17 C x b 6.8
18 C y c 5.4
19 D x a 7.9
20 D y b 12.2
21 D x c 16.1
Perhaps something like this using dplyr and tidyr? First pivot_longer, then get group/facet/stack counts, and divide those counts by the sum of such counts (within group/facet).
library(dplyr)
library(tidyr)
# `data` is assumed to hold the data frame shown in the question.
# Stack every column except `group` into facet (old column name) / stack
# (cell value) pairs.
pivot_longer(data, -group, names_to="facet", values_to = "stack") %>%
group_by(facet,group,stack) %>%
# Count rows per facet/group/stack. summarize() drops the last grouping
# level (stack), so the result is still grouped by facet and group.
summarize(per =n()) %>%
# sum(per) is therefore the total within each facet/group combination.
mutate(per =per/sum(per))
Output:
facet group stack per
<chr> <chr> <chr> <dbl>
1 A x a 0.25
2 A x c 0.75
3 A y a 0.25
4 A y b 0.75
5 B x a 0.25
6 B x b 0.5
7 B x c 0.25
8 B y a 0.25
9 B y b 0.25
10 B y c 0.5
11 C x a 0.5
12 C x c 0.5
13 C y a 0.25
14 C y b 0.5
15 C y c 0.25
16 D x a 0.25
17 D x c 0.75
18 D y a 0.25
19 D y b 0.75

Subset row in df after the first row containing a specific value

Hello I have a df such as
COL1 COL2
A OKI
B OKO
C OKU
D OKP
E BRUT
F 0.87
G 0.82
H 0.57
and I would like to subset the df to keep all lines after the "BRUT" row
and get :
COL1 COL2
F 0.87
G 0.82
H 0.57
You can use match to get the line with BRUT, add 1 and create a sequence until nrow(x) to subset x to get all lines after BRUT.
# NOTE: if "BRUT" is in the last row, (n + 1):n counts down, so this returns
# an NA row plus the last row instead of zero rows; the tail() and seq_len()
# variants further down return zero rows in that case.
x[(match("BRUT", x$COL2)+1):nrow(x),]
# COL1 COL2
#6 F 0.87
#7 G 0.82
#8 H 0.57
Or using tail, as suggested by @thelatemail (Thanks!).
tail(x, -match("BRUT",x$COL2))
Or some other alternatives:
x[-(1:match("BRUT", x$COL2)),]
x[-seq_len(match("BRUT", x$COL2)),]
It seems you only want the numeric values. In this case a more robust solution can be,
df[grepl('[0-9]', df$COL2),]
# COL1 COL2
#6 F 0.87
#7 G 0.82
#8 H 0.57
You can use which.max to get row number for first value of "BRUT".
df[(which.max(df$COL2 == 'BRUT') + 1):nrow(df), ]
# COL1 COL2
#6 F 0.87
#7 G 0.82
#8 H 0.57
Some other options comparing with row number :
df[seq_len(nrow(df)) > which.max(df$COL2 == 'BRUT'), ]
Using dplyr :
library(dplyr)
df %>% filter(row_number() > which.max(COL2 == 'BRUT'))
data
df <- structure(list(COL1 = c("A", "B", "C", "D", "E", "F", "G", "H"
), COL2 = c("OKI", "OKO", "OKU", "OKP", "BRUT", "0.87", "0.82",
"0.57")), class = "data.frame", row.names = c(NA, -8L))
Another option with cumsum in base R
subset(df, cumsum(cumsum(COL2 == "BRUT")) >1)
# COL1 COL2
#6 F 0.87
#7 G 0.82
#8 H 0.57
data
df <- structure(list(COL1 = c("A", "B", "C", "D", "E", "F", "G", "H"
), COL2 = c("OKI", "OKO", "OKU", "OKP", "BRUT", "0.87", "0.82",
"0.57")), class = "data.frame", row.names = c(NA, -8L))

How to insert one column of data from one table into another?

I am trying to add this table:
# [,1]
#[1,] -0.870 8
#[2,] -0.750 7
#[3,] 2.290 2
#[4,] -0.050 5
#[5,] 0.355 4
#[6,] -0.895 9
#[7,] 3.290 1
#[8,] -0.510 6
#[9,] 0.430 3
#[10,] -3.290 10
into the respective "predAwayScore" and "predHomeScore" columns in my data frame.
I want to insert the left hand column of the first data set (-.87, -.75, etc.) into the appropriate cells. The right hand side of that data set (8,7,2,etc.) corresponds to the letter on the data frame that the value needs to be entered. (For instance, AwayTeam E = 5 = -.05)
I am unsure how to insert one column into another data frame, and how to refer to the corresponding letter guide that is attached.
I appreciate any help.
One option is to create a named vector, use that to match the 'AwayTeam', 'HomeTeam' values to get the corresponding scores and assign those values to the columns 'predAwayScore', 'predHomeScore'
# Name each score (column 1 of m1) with the letter whose alphabet position is
# given in column 2 of m1, e.g. 5 -> "E".
nm1 <- setNames(m1[,1], LETTERS[m1[,2]])
# Look up each team's score by its letter.
df1$predAwayScore <- nm1[df1[['AwayTeam']]]
df1$predHomeScore <- nm1[df1[['HomeTeam']]]
df1
# Week AwayTeam HomeTeam predAwayScore predHomeScore
#1 3 E A -0.050 3.29
#2 3 A F 3.290 -0.51
#3 4 H E -0.870 -0.05
#4 4 I A -0.895 3.29
#5 5 F C -0.510 0.43
#6 5 F J -0.510 -3.29
data
m1 <- structure(c(-0.87, -0.75, 2.29, -0.05, 0.3555, -0.895, 3.29,
-0.51, 0.43, -3.29, 8, 7, 2, 5, 4, 9, 1, 6, 3, 10), .Dim = c(10L,
2L))
df1 <- structure(list(Week = c(3, 3, 4, 4, 5, 5), AwayTeam = c("E",
"A", "H", "I", "F", "F"), HomeTeam = c("A", "F", "E", "A", "C",
"J")), class = "data.frame", row.names = c(NA, -6L))

Conditional replacement while match on a variable

I want to replace the NA values for observations within a particular sub-group, but the sequence of the observations in that group is not ordered properly. So I am wondering if there exists some dplyr or plyr command that would allow me to replace missing values in a column belonging to one dataframe using the values from the same column from another dataframe while matching on the values of that "key" column.
Here's what I got. Hope someone could shed light on this. Thanks.
## data frame that contains missing values in "diff" column
df <- data.frame(type = c(1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3),
diff = c(0.1, 0.3, NA, NA, NA, NA, NA, 0.2, 0.7, NA, 0.5, NA),
name = c("A", "B", "C", "D", "E", "A", "B", "C", "F", "A", "B", "C"))
## replace with values from this smaller data frame
df2 <- data.frame(diff_rep = c(0.3, 0.2, 0.4), name = c("A", "B", "C"))
## replace using ifelse
df$diff <- ifelse(is.na(df$diff) & (df$type == 2), df2$diff_rep , df$diff)
df
type diff name
1 1 0.1 A
2 1 0.3 B
3 1 NA C
4 2 0.3 D
5 2 0.2 E
6 2 0.4 A
7 2 0.3 B
8 2 0.2 C
9 2 0.7 F
10 3 NA A
11 3 0.5 B
12 3 NA C
## desired output
type diff name
1 1 0.1 A
2 1 0.3 B
3 1 NA C
4 2 NA D
5 2 NA E
6 2 0.3 A
7 2 0.2 B
8 2 0.4 C
9 2 0.7 F
10 3 NA A
11 3 0.5 B
12 3 NA C
Assuming row 9 is a mistake, you can use a left join first and then use if_else()/ifelse() and coalesce() to get your desired result. coalesce() returns the first non-missing value.
# Attach diff_rep to every row whose name occurs in df2 (NA for other names).
left_join(df, df2, by = "name") %>%
# Step 1: for type 2 rows, fill a missing diff from diff_rep; rows of other
# types keep their diff unchanged.
mutate(diff_wanted = if_else(type == 2,
coalesce(diff, diff_rep),
diff),
# Step 2: set the result to NA for names that are not in df2 (this is what
# turns row 9, name "F", into NA).
diff_wanted = ifelse(name %in% df2$name,
diff_wanted,
NA)) %>%
select(type, diff_wanted, name)

calculate mean by group by avoiding first value in the group in R

I have a big dataframe like this:
groupvar <- c("A", "A", "A", "A", "B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "E", "E")
valuevar <- c( 1, 0.5, 0.5, 0.5, 1, 0.75, 0.75, 1, 0.8, 0.8, 0.8, 1, 0.9, 0.9, 1, 1.5)
myd <- data.frame (groupvar, valuevar)
groupvar valuevar
1 A 1.00
2 A 0.50
3 A 0.50
4 A 0.50
5 B 1.00
6 B 0.75
7 B 0.75
8 C 1.00
9 C 0.80
10 C 0.80
11 C 0.80
12 D 1.00
13 D 0.90
14 D 0.90
15 E 1.00
16 E 1.50
I would like to calculate the mean for each groupvar while excluding the first value in each group. In this example, the first value in each group is always 1. For example, for group "A" the average would be based on 0.5, 0.5, 0.5, skipping the first value 1.
This is what I was thinking:
meanfun <- function(x)sum(x)-x[1]/ length(x)
ddply (myd,"groupvar",meanfun)
Error in FUN(X[[1L]], ...) :
only defined on a data frame with all numeric variables
This can be helpful
> with(myd, tapply(valuevar, groupvar, function(x) mean(x[-1])))
A B C D E
0.50 0.75 0.80 0.90 1.50
Using aggregate
> aggregate(valuevar ~ groupvar, FUN=function(x) mean(x[-1]), data=myd)
groupvar valuevar
1 A 0.50
2 B 0.75
3 C 0.80
4 D 0.90
5 E 1.50
Using ddply
> library(plyr)
> ddply (myd, "groupvar", summarize, MeanVar=mean(valuevar[-1]))
groupvar MeanVar
1 A 0.50
2 B 0.75
3 C 0.80
4 D 0.90
5 E 1.50
You could split the data by groupvar and apply the mean function.
groupvar <- c("A", "A", "A", "A", "B", "B", "B", "C", "C", "C", "C", "D", "D", "D", "E", "E")
valuevar <- c( 1, 0.5, 0.5, 0.5, 1, 0.75, 0.75, 1, 0.8, 0.8, 0.8, 1, 0.9, 0.9, 1, 1.5)
myd <- data.frame (groupvar, valuevar)
lapply(split(myd, f=myd[, "groupvar"]), function(x) mean(x[-1,2]))
What I would do is create a new dataframe that eliminates the first element of the group var. Then I would take the means over the group var.
myd_rmFstElement <- myd[which(duplicated(myd$groupvar)), ]
myd_means <- aggregate(valuevar ~ groupvar, FUN=mean, myd_rmFstElement)

Resources