Exclude rows where value used in another row - r

Imagine you have the following data set:
df = data.frame(ID = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20), gender= c(1,2,1,2,2,2,2,1,1,2,1,2,1,2,2,2,2,1,1,2),
PID = c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10))
how can I write a code that removes the rows in the df whose gender and PID are the same (see picture). Please imagine that the code is over 1000 rows long (so it should be a solution that automatically searches for the right values to exclude).

base R
df[ave(rep(TRUE, nrow(df)), df[,c("gender","paar")], FUN = function(z) !any(duplicated(z))),]
# ID gender paar
# 1 1 1 1
# 2 2 2 1
# 3 3 1 2
# 4 4 2 2
# 7 7 2 4
# 8 8 1 4
# 9 9 1 5
# 10 10 2 5
# 11 11 1 6
# 12 12 2 6
# 13 13 1 7
# 14 14 2 7
# 17 17 2 9
# 18 18 1 9
# 19 19 1 10
# 20 20 2 10
dplyr
library(dplyr)
df %>%
group_by(gender, paar) %>%
filter(!any(duplicated(cbind(gender, paar)))) %>%
ungroup()

In base R, we may use subset after removing the observations where the group count for 'gender' and 'paar' are not 1
subset(df, ave(seq_along(gender), gender, paar, FUN = length) == 1)
Or with duplicated
df[!(duplicated(df[-1])|duplicated(df[-1], fromLast = TRUE)),]
-output
ID gender paar
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
7 7 2 4
8 8 1 4
9 9 1 5
10 10 2 5
11 11 1 6
12 12 2 6
13 13 1 7
14 14 2 7
17 17 2 9
18 18 1 9
19 19 1 10
20 20 2 10

Here is one more: :-)
library(dplyr)
df %>%
group_by(gender, PID) %>%
filter(is.na(ifelse(n()>1, 1, NA)))
ID gender PID
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
5 7 2 4
6 8 1 4
7 9 1 5
8 10 2 5
9 11 1 6
10 12 2 6
11 13 1 7
12 14 2 7
13 17 2 9
14 18 1 9
15 19 1 10
16 20 2 10

Another dplyr option could be:
df %>%
filter(with(rle(paste0(gender, PID)), rep(lengths == 1, lengths)))
ID gender PID
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
5 7 2 4
6 8 1 4
7 9 1 5
8 10 2 5
9 11 1 6
10 12 2 6
11 13 1 7
12 14 2 7
13 17 2 9
14 18 1 9
15 19 1 10
16 20 2 10
If the duplicated values can occur also between non-consecutive rows:
df %>%
arrange(gender, PID) %>%
filter(with(rle(paste0(gender, PID)), rep(lengths == 1, lengths)))

Using aggregate
na.omit(aggregate(. ~ gender + PID, df, function(x)
ifelse(length(x) == 1, x, NA)))
gender PID ID
1 1 1 1
2 2 1 2
3 1 2 3
4 2 2 4
6 1 4 8
7 2 4 7
8 1 5 9
9 2 5 10
10 1 6 11
11 2 6 12
12 1 7 13
13 2 7 14
15 1 9 18
16 2 9 17
17 1 10 19
18 2 10 20
With dplyr
library(dplyr)
df %>%
group_by(gender, PID) %>%
filter(n() == 1) %>%
ungroup()
# A tibble: 16 × 3
ID gender PID
<dbl> <dbl> <dbl>
1 1 1 1
2 2 2 1
3 3 1 2
4 4 2 2
5 7 2 4
6 8 1 4
7 9 1 5
8 10 2 5
9 11 1 6
10 12 2 6
11 13 1 7
12 14 2 7
13 17 2 9
14 18 1 9
15 19 1 10
16 20 2 10

Related

Replace row value in a data frame group by the smallest value in that group

I have the following data set:
time <- c(0,1,2,3,4,5,0,1,2,3,4,5,0,1,2,3,4,5)
value <- c(10,8,6,5,3,2,12,10,6,5,4,2,20,15,16,9,2,2)
group <- c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3)
data <- data.frame(time, value, group)
I want to create a new column called data$diff that is equal to data$value minus the value of data$value when data$time == 0 within each group.
I am beginning with the following code
for(i in 1:nrow(data)){
for(n in 1:max(data$group)){
if(data$group[i] == n) {
data$diff[i] <- ???????
}
}
}
But cannot figure out what to put in place of the question marks. The desired output would be this table: https://i.stack.imgur.com/1bAKj.png
Any thoughts are appreciated.
Since in your example data$time == 0 is always the first element of the group, you can use this data.table approach.
library(data.table)
setDT(data)
data[, diff := value[1] - value, by = group]
In case that data$time == 0 is not the first element in each group you can use this:
data[, diff := value[time==0] - value, by = group]
Output:
> data
time value group diff
1: 0 10 1 0
2: 1 8 1 2
3: 2 6 1 4
4: 3 5 1 5
5: 4 3 1 7
6: 5 2 1 8
7: 0 12 2 0
8: 1 10 2 2
9: 2 6 2 6
10: 3 5 2 7
11: 4 4 2 8
12: 5 2 2 10
13: 0 20 3 0
14: 1 15 3 5
15: 2 16 3 4
16: 3 9 3 11
17: 4 2 3 18
18: 5 2 3 18
Here is a base R approach.
within(data, diff <- ave(
seq_along(value), group,
FUN = \(i) value[i][time[i] == 0] - value[i]
))
Output
time value group diff
1 0 10 1 0
2 1 8 1 2
3 2 6 1 4
4 3 5 1 5
5 4 3 1 7
6 5 2 1 8
7 0 12 2 0
8 1 10 2 2
9 2 6 2 6
10 3 5 2 7
11 4 4 2 8
12 5 2 2 10
13 0 20 3 0
14 1 15 3 5
15 2 16 3 4
16 3 9 3 11
17 4 2 3 18
18 5 2 3 18
Here is a short way to do it with dplyr.
library(dplyr)
data %>%
group_by(group) %>%
mutate(diff = value[which(time == 0)] - value)
Which gives
# Groups: group [3]
time value group diff
<dbl> <dbl> <dbl> <dbl>
1 0 10 1 0
2 1 8 1 2
3 2 6 1 4
4 3 5 1 5
5 4 3 1 7
6 5 2 1 8
7 0 12 2 0
8 1 10 2 2
9 2 6 2 6
10 3 5 2 7
11 4 4 2 8
12 5 2 2 10
13 0 20 3 0
14 1 15 3 5
15 2 16 3 4
16 3 9 3 11
17 4 2 3 18
18 5 2 3 18
library(dplyr)
vals2use <- data %>%
group_by(group) %>%
filter(time==0) %>%
select(c(2,3)) %>%
rename(value4diff=value)
dataNew <- merge(data, vals2use, all=T)
dataNew$diff <- dataNew$value4diff-dataNew$value
dataNew <- dataNew[,c(1,2,3,5)]
dataNew
group time value diff
1 1 0 10 0
2 1 1 8 2
3 1 2 6 4
4 1 3 5 5
5 1 4 3 7
6 1 5 2 8
7 2 0 12 0
8 2 1 10 2
9 2 2 6 6
10 2 3 5 7
11 2 4 4 8
12 2 5 2 10
13 3 0 20 0
14 3 1 15 5
15 3 2 16 4
16 3 3 9 11
17 3 4 2 18
18 3 5 2 18

Use apply to create a list of adjacency matrices from dataframe in R

I have an edgelist of friendships with 5 different schools over 3 waves. I'd like to create a list for each school that contains 3 adjacency matrices (one for each wave). I can do this one by one, but I would like to use a loop or an apply function to automate it.
This is the code I have used for one school and wave:
school1_w1 <- filter(edges, school == 1 & wave == 1) %>%
graph_from_data_frame(., directed = TRUE) %>%
as_adjacency_matrix() %>% as.matrix()
school1_w2 <- filter(edges, school == 1 & wave == 2) %>%
graph_from_data_frame(., directed = TRUE) %>%
as_adjacency_matrix() %>% as.matrix()
school1_w3 <- filter(edges, school == 1 & wave == 3) %>%
graph_from_data_frame(., directed = TRUE) %>%
as_adjacency_matrix() %>% as.matrix()
school1 <- list(school1_w1, school1_w2, school1_w3)
How can I do this for all 5 schools with an apply or loop? Sample data below:
ego alter wave school
1 4 1 1
1 4 2 1
1 3 3 1
2 3 1 1
2 4 2 1
2 4 3 1
3 1 1 1
3 2 2 1
3 3 3 1
4 1 1 1
4 1 2 1
4 1 3 1
5 8 1 2
5 6 2 2
5 7 3 2
6 7 1 2
6 7 2 2
6 7 3 2
7 8 1 2
7 6 2 2
7 6 3 2
8 7 1 2
8 7 2 2
8 7 3 2
9 10 1 3
9 11 2 3
9 12 3 3
10 11 1 3
10 11 2 3
10 9 3 3
11 12 1 3
11 10 2 3
11 12 3 3
12 9 1 3
12 10 2 3
12 10 3 3
13 14 1 4
13 15 2 4
13 16 3 4
14 16 1 4
14 16 2 4
14 13 3 4
15 16 1 4
15 16 2 4
15 16 3 4
16 15 1 4
16 15 2 4
16 15 3 4
17 20 1 5
17 18 2 5
17 18 3 5
18 19 1 5
18 20 2 5
18 19 3 5
19 17 1 5
19 17 2 5
19 17 3 5
20 18 1 5
20 17 2 5
20 17 3 5
We can use split + lapply :
library(igraph)
result <- lapply(split(edges, list(edges$school, edges$wave)), function(x) {
graph_from_data_frame(x, directed = TRUE) %>%
as_adjacency_matrix() %>% as.matrix()
})
Or with by :
result <- by(edges, list(edges$school, edges$wave), function(x) {
graph_from_data_frame(x, directed = TRUE) %>%
as_adjacency_matrix() %>% as.matrix()
})

R: Separate data into combinations of two columns

I have some data where each id is measured by different types which can be have different values type_val. The measured value is val. A small dummy data is like this:
df <- data.frame(id=rep(letters[1:2],6),
type=c(rep('t1',6), rep('t2',6)),
type_val=rep(c(1,1,2,2,3,3),2),
val=1:12)
Then df is:
id type type_val val
1 a t1 1 1
2 b t1 1 2
3 a t1 2 3
4 b t1 2 4
5 a t1 3 5
6 b t1 3 6
7 a t2 1 7
8 b t2 1 8
9 a t2 2 9
10 b t2 2 10
11 a t2 3 11
12 b t2 3 12
I need to spread/cast data so that all combinations of type and type_val for each id are row-wise. I think this must be a job for pkgs reshape2 or tidyr but I have completely failed to generate anything other than errors.
The outcome data structure - somewhat redundant - would be something like this (hope I got it right!) where pairs of type (as given by combinations of the type_val) are columns type_t1 and type_t2 , and their associated values (val in df) are val_t1 and val_t2 - columns names are of cause arbitrary :
id type_t1 type_t2 val_t1 val_t2
1 a 1 1 1 7
2 a 1 2 1 9
3 a 1 3 1 11
4 a 2 1 3 7
5 a 2 2 3 9
6 a 2 3 3 11
7 a 3 1 5 7
8 a 3 2 5 9
9 a 3 3 5 11
10 b 1 1 2 8
11 b 1 2 2 10
12 b 1 3 2 12
13 b 2 1 4 8
14 b 2 2 4 10
15 b 2 3 4 12
16 b 3 1 6 8
17 b 3 2 6 10
18 b 3 3 6 12
UPDATE
Note that (#Sotos)
> spread(df, type, val)
id type_val t1 t2
1 a 1 1 7
2 a 2 3 9
3 a 3 5 11
4 b 1 2 8
5 b 2 4 10
6 b 3 6 12
is not the desired output - it fails to deliver the wide format defined by combinations of type and type_val in df.
how about this:
df1=df[df$type=="t1",]
df2=df[df$type=="t2",]
DF=merge(df1,df2,by="id")
DF=DF[,-c(2,5)]
colnames(DF)<-c("id", "type_t1", "val_t1","type_t2", "val_t2")
Here is something more generic that will work with an arbitrary number of unique type:
library(dplyr)
# This function takes a list of dataframes (.data) and merges them by ID
reduce_merge <- function(.data, ID) {
return(Reduce(function(x, y) merge(x, y, by = ID), .data))
}
# This function renames the cols columns in .data by appending _identifier
batch_rename <- function(.data, cols, identifier, sep = '_') {
return(plyr::rename(.data, sapply(cols, function(x){
x = paste(x, .data[1, identifier], sep = sep)
})))
}
# This function creates a list of subsetted dataframes
# (subsetted by values of key),
# uses batch_rename() to give each dataframe more informative column names,
# merges them together, and returns the columns you'd like in a sensible order
multi_spread <- function(.data, grp, key, vals) {
.data %>%
plyr::dlply(key, subset) %>%
lapply(batch_rename, vals, key) %>%
reduce_merge(grp) %>%
select(-starts_with(paste0(key, '.'))) %>%
select(id, sort(setdiff(colnames(.), c(grp, key, vals))))
}
# Your example
df <- data.frame(id=rep(letters[1:2],6),
type=c(rep('t1',6), rep('t2',6)),
type_val=rep(c(1,1,2,2,3,3),2),
val=1:12)
df %>% multi_spread('id', 'type', c('type_val', 'val'))
id type_val_t1 type_val_t2 val_t1 val_t2
1 a 1 1 1 7
2 a 1 2 1 9
3 a 1 3 1 11
4 a 2 1 3 7
5 a 2 2 3 9
6 a 2 3 3 11
7 a 3 1 5 7
8 a 3 2 5 9
9 a 3 3 5 11
10 b 1 1 2 8
11 b 1 2 2 10
12 b 1 3 2 12
13 b 2 1 4 8
14 b 2 2 4 10
15 b 2 3 4 12
16 b 3 1 6 8
17 b 3 2 6 10
18 b 3 3 6 12
# An example with three unique values of 'type'
df <- data.frame(id = rep(letters[1:2], 9),
type = c(rep('t1', 6), rep('t2', 6), rep('t3', 6)),
type_val = rep(c(1, 1, 2, 2, 3, 3), 3),
val = 1:18)
df %>% multi_spread('id', 'type', c('type_val', 'val'))
id type_val_t1 type_val_t2 type_val_t3 val_t1 val_t2 val_t3
1 a 1 1 1 1 7 13
2 a 1 1 2 1 7 15
3 a 1 1 3 1 7 17
4 a 1 2 1 1 9 13
5 a 1 2 2 1 9 15
6 a 1 2 3 1 9 17
7 a 1 3 1 1 11 13
8 a 1 3 2 1 11 15
9 a 1 3 3 1 11 17
10 a 2 1 1 3 7 13
11 a 2 1 2 3 7 15
12 a 2 1 3 3 7 17
13 a 2 2 1 3 9 13
14 a 2 2 2 3 9 15
15 a 2 2 3 3 9 17
16 a 2 3 1 3 11 13
17 a 2 3 2 3 11 15
18 a 2 3 3 3 11 17
19 a 3 1 1 5 7 13
20 a 3 1 2 5 7 15
21 a 3 1 3 5 7 17
22 a 3 2 1 5 9 13
23 a 3 2 2 5 9 15
24 a 3 2 3 5 9 17
25 a 3 3 1 5 11 13
26 a 3 3 2 5 11 15
27 a 3 3 3 5 11 17
28 b 1 1 1 2 8 14
29 b 1 1 2 2 8 16
30 b 1 1 3 2 8 18
31 b 1 2 1 2 10 14
32 b 1 2 2 2 10 16
33 b 1 2 3 2 10 18
34 b 1 3 1 2 12 14
35 b 1 3 2 2 12 16
36 b 1 3 3 2 12 18
37 b 2 1 1 4 8 14
38 b 2 1 2 4 8 16
39 b 2 1 3 4 8 18
40 b 2 2 1 4 10 14
41 b 2 2 2 4 10 16
42 b 2 2 3 4 10 18
43 b 2 3 1 4 12 14
44 b 2 3 2 4 12 16
45 b 2 3 3 4 12 18
46 b 3 1 1 6 8 14
47 b 3 1 2 6 8 16
48 b 3 1 3 6 8 18
49 b 3 2 1 6 10 14
50 b 3 2 2 6 10 16
51 b 3 2 3 6 10 18
52 b 3 3 1 6 12 14
53 b 3 3 2 6 12 16
54 b 3 3 3 6 12 18

Data Frame Filter Values

Suppose I have the next data frame.
table<-data.frame(group=c(0,5,10,15,20,25,30,35,40,0,5,10,15,20,25,30,35,40,0,5,10,15,20,25,30,35,40),plan=c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3),price=c(1,4,5,6,8,9,12,12,12,3,5,6,7,10,12,20,20,20,5,6,8,12,15,20,22,28,28))
group plan price
1 0 1 1
2 5 1 4
3 10 1 5
4 15 1 6
5 20 1 8
6 25 1 9
7 30 1 12
8 35 1 12
9 40 1 12
10 0 2 3
11 5 2 5
12 10 2 6
13 15 2 7
14 20 2 10
15 25 2 12
16 30 2 20
17 35 2 20
18 40 2 20
How can I get the values from the table up to the maximum price, without duplicates.
So the result would be:
group plan price
1 0 1 1
2 5 1 4
3 10 1 5
4 15 1 6
5 20 1 8
6 25 1 9
7 30 1 12
10 0 2 3
11 5 2 5
12 10 2 6
13 15 2 7
14 20 2 10
15 25 2 12
16 30 2 20
You can use slice in dplyr:
library(dplyr)
table %>%
group_by(plan) %>%
slice(1:which.max(price == max(price)))
which.max gives the index of the first occurrence of price == max(price). Using that, I can slice the data.frame to only keep rows for each plan up to the maximum price.
Result:
# A tibble: 22 x 3
# Groups: plan [3]
group plan price
<dbl> <dbl> <dbl>
1 0 1 1
2 5 1 4
3 10 1 5
4 15 1 6
5 20 1 8
6 25 1 9
7 30 1 12
8 0 2 3
9 5 2 5
10 10 2 6
# ... with 12 more rows

count positive negative values in column by group

I want to create two variables giving me the total number of positive and negative values by id, hopefully using dplyr.
Example data:
library(dplyr)
set.seed(42)
df <- data.frame (id=rep(1:10,each=10),
ff=rnorm(100, 0,14 ))
> head(df,20)
id ff
1 1 19.1934183
2 1 -7.9057744
3 1 5.0837978
4 1 8.8600765
5 1 5.6597565
6 1 -1.4857432
7 1 21.1613080
8 1 -1.3252265
9 1 28.2579320
10 1 -0.8779974
11 2 18.2681752
12 2 32.0130355
13 2 -19.4440498
14 2 -3.9030427
15 2 -1.8664987
16 2 8.9033056
17 2 -3.9795409
18 2 -37.1903759
19 2 -34.1665370
20 2 18.4815868
the resulting dataset should look like:
> head(df,20)
id ff pos neg
1 1 19.1934183 6 4
2 1 -7.9057744 6 4
3 1 5.0837978 6 4
4 1 8.8600765 6 4
5 1 5.6597565 6 4
6 1 -1.4857432 6 4
7 1 21.1613080 6 4
8 1 -1.3252265 6 4
9 1 28.2579320 6 4
10 1 -0.8779974 6 4
11 2 18.2681752 4 6
12 2 32.0130355 4 6
13 2 -19.4440498 4 6
14 2 -3.9030427 4 6
15 2 -1.8664987 4 6
16 2 8.9033056 4 6
17 2 -3.9795409 4 6
18 2 -37.1903759 4 6
19 2 -34.1665370 4 6
20 2 18.4815868 4 6
I have thought something similar to this will work:
df<-df%>% group_by(id) %>% mutate(pos= nrow(ff>0)) %>% ungroup()
Any help would be great, thanks.
You need sum():
df %>% group_by(id) %>%
mutate(pos = sum(ff>0),
neg = sum(ff<0))
For a fun (and a fast) solution data.table can also be used:
library(data.table)
setDT(df)
df[, ":="(pos = sum(ff > 0), neg = sum(ff < 0)), by = id]
Here's an answer that add the ifelse part of your question:
df <- df %>% group_by(id) %>%
mutate(pos = sum(ff>0), neg = sum(ff<0)) %>%
group_by(id) %>%
mutate(any_neg=ifelse(any(ff < 0), 1, 0))
Output:
> head(df, 20)
Source: local data frame [20 x 5]
Groups: id [2]
id ff pos neg any_neg
<int> <dbl> <int> <int> <dbl>
1 1 19.1934183 6 4 1
2 1 -7.9057744 6 4 1
3 1 5.0837978 6 4 1
4 1 8.8600765 6 4 1
5 1 5.6597565 6 4 1
6 1 -1.4857432 6 4 1
7 1 21.1613080 6 4 1
8 1 -1.3252265 6 4 1
9 1 28.2579320 6 4 1
10 1 -0.8779974 6 4 1
11 2 18.2681752 4 6 1
12 2 32.0130355 4 6 1
13 2 -19.4440498 4 6 1
14 2 -3.9030427 4 6 1
15 2 -1.8664987 4 6 1
16 2 8.9033056 4 6 1
17 2 -3.9795409 4 6 1
18 2 -37.1903759 4 6 1
19 2 -34.1665370 4 6 1
20 2 18.4815868 4 6 1

Resources