Replace consecutive NA values with sum of another column - r

I am trying to replace each run of consecutive NA values with the sum of the corresponding values from another column, but I'm a little confused.
This is what the data looks like:
df
# Distance Distance2
# 1 160 8
# 2 20 NA
# 3 30 15
# 4 100 11
# 5 35 NA
# 6 42 NA
# 7 10 NA
# 8 10 2
# 9 9 NA
# 10 20 NA
And I am looking to get a result like this:
df
# Distance Distance2
# 1 160 8
# 2 20 20
# 3 30 15
# 4 100 11
# 5 35 87
# 6 42 87
# 7 10 87
# 8 10 2
# 9 9 29
# 10 20 29
Thanks in advance for your help

We can use rleid to create groups and replace NA with sum of Distance values.
library(data.table)
setDT(df)[, Distance_new := replace(Distance2, is.na(Distance2),
sum(Distance)), rleid(Distance2)]
df
# Distance Distance2 Distance_new
# 1: 160 8 8
# 2: 20 NA 20
# 3: 30 15 15
# 4: 100 11 11
# 5: 35 NA 87
# 6: 42 NA 87
# 7: 10 NA 87
# 8: 10 2 2
# 9: 9 NA 29
#10: 20 NA 29
We can also use this in dplyr :
library(dplyr)
df %>%
group_by(gr = rleid(Distance2)) %>%
mutate(Distance_new = replace(Distance2, is.na(Distance2), sum(Distance)))
data
df <- structure(list(Distance = c(160L, 20L, 30L, 100L, 35L, 42L, 10L,
10L, 9L, 20L), Distance2 = c(8L, NA, 15L, 11L, NA, NA, NA, 2L,
NA, NA)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"))

You can group by consecutive NAs and replace with the sum, i.e.
library(dplyr)
df %>%
group_by(grp = cumsum(c(TRUE, diff(is.na(df$Distance2)) != 0))) %>%
mutate(Distance2 = replace(Distance2, is.na(Distance2), sum(Distance)))
# A tibble: 10 x 3
# Groups: grp [6]
Distance Distance2 grp
<int> <int> <int>
1 160 8 1
2 20 20 2
3 30 15 3
4 100 11 3
5 35 87 4
6 42 87 4
7 10 87 4
8 10 2 5
9 9 29 6
10 20 29 6

We can use fcoalesce
library(data.table)
library(zoo)
setDT(df)[, Distance2 := fcoalesce(Distance2, na.aggregate(Distance, FUN = sum)),
rleid(Distance2)]
data
df <- structure(list(Distance = c(160L, 20L, 30L, 100L, 35L, 42L, 10L,
10L, 9L, 20L), Distance2 = c(8L, NA, 15L, 11L, NA, NA, NA, 2L,
NA, NA)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"))

Related

combine two datasets with different dimension by ID

I have two datasets:
df1:
ID score
1 1 30
2 1 10
3 1 22
4 2 44
5 2 6
6 3 5
7 3 20
8 4 35
9 4 2
10 4 60
11 5 14
12 5 5
df2:
ID para1 para2
1 1 10 5
2 1 10 5
3 2 20 10
4 2 20 10
5 3 30 15
6 4 40 20
7 4 40 20
8 4 40 20
9 4 40 20
10 5 50 25
11 5 50 25
12 5 50 25
13 6 60 30
14 6 60 30
I would like to combine df1 and df2 by ID and get df3 below. Tried merge and left_join but they don't work well as I probably missed something. Any simple way to get this?
df3:
ID score para1 para2
1 1 30 10 5
2 1 10 10 5
3 1 22 10 5
4 2 44 20 10
5 2 6 20 10
6 3 5 30 15
7 3 20 30 15
8 4 35 40 20
9 4 2 40 20
10 4 60 40 20
11 5 14 50 25
12 5 5 50 25
One option to achieve your desired result would be to first get rid of the duplicated rows in your df2 using e.g. dplyr::distinct:
library(dplyr)
df1 %>%
left_join(distinct(df2, ID, para1, para2))
#> Joining, by = "ID"
#> ID score para1 para2
#> 1 1 30 10 5
#> 2 1 10 10 5
#> 3 1 22 10 5
#> 4 2 44 20 10
#> 5 2 6 20 10
#> 6 3 5 30 15
#> 7 3 20 30 15
#> 8 4 35 40 20
#> 9 4 2 40 20
#> 10 4 60 40 20
#> 11 5 14 50 25
#> 12 5 5 50 25
DATA
df1 <- structure(list(ID = c(
1L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L,
5L, 5L
), score = c(
30L, 10L, 22L, 44L, 6L, 5L, 20L, 35L, 2L,
60L, 14L, 5L
)), class = "data.frame", row.names = c(
"1", "2",
"3", "4", "5", "6", "7", "8", "9", "10", "11", "12"
))
df2 <- structure(list(ID = c(
1L, 1L, 2L, 2L, 3L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 6L, 6L
), para1 = c(
10L, 10L, 20L, 20L, 30L, 40L, 40L,
40L, 40L, 50L, 50L, 50L, 60L, 60L
), para2 = c(
5L, 5L, 10L, 10L,
15L, 20L, 20L, 20L, 20L, 25L, 25L, 25L, 30L, 30L
)), class = "data.frame", row.names = c(
"1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14"
))
Another approach using data.table would be to do a join and keep the first matched row in the second data.frame.
In this case, for each row of df1, the rows of df2 with a matching ID are looked up, and mult = "first" keeps only the first matching row of df2.
You can also set the nomatch argument to control what happens when a row has no match (NA is returned by default; nomatch = NULL drops such rows). See ?data.table for more details.
library(data.table)
setDT(df1)
setDT(df2)
df2[df1, mult = "first", on = "ID"]
Output
ID para1 para2 score
1: 1 10 5 30
2: 1 10 5 10
3: 1 10 5 22
4: 2 20 10 44
5: 2 20 10 6
6: 3 30 15 5
7: 3 30 15 20
8: 4 40 20 35
9: 4 40 20 2
10: 4 40 20 60
11: 5 50 25 14
12: 5 50 25 5

Is there a way to automatically average multiple treatments at once in R? [duplicate]

This question already has answers here:
Calculate the mean by group
(9 answers)
Closed 2 years ago.
Very sorry if this is a reposted question, I checked the search engine and couldn't find the answer I was looking for. Say I have the following dataset:
Plot Plant Count
1 101 1 9
2 101 2 15
3 101 3 5
4 101 4 15
5 101 5 26
6 102 1 9
7 102 2 26
8 102 3 9
9 102 4 15
10 102 5 17
11 103 1 12
12 103 2 6
13 103 3 22
14 103 4 12
15 103 5 6
I'd like to average the "Count" number between the 5 plants of each plot. However, in my real dataset, I have many more than 3 plots. Is there a way to write my code so that it automatically averages all my plots at once? I'd like to learn to write code that would get me the average for each plot as efficiently as possible. Any help would be very much appreciated.
I am fairly new to stackoverflow and am not the strongest with R, so if I have made a mistake in my formatting or something similar please let me know. Thanks for your time!
Try this with dplyr using group_by() and summarise(). Here is the code:
library(dplyr)
#Data
newdf <- df %>% group_by(Plot) %>% summarise(Avg=mean(Count))
Output:
# A tibble: 3 x 2
Plot Avg
<int> <dbl>
1 101 14
2 102 15.2
3 103 11.6
Some data used:
#Data
df <- structure(list(Plot = c(101L, 101L, 101L, 101L, 101L, 102L, 102L,
102L, 102L, 102L, 103L, 103L, 103L, 103L, 103L), Plant = c(1L,
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), Count = c(9L,
15L, 5L, 15L, 26L, 9L, 26L, 9L, 15L, 17L, 12L, 6L, 22L, 12L,
6L)), class = "data.frame", row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"))
If you want to keep your variables use mutate() in this way:
#Code
newdf <- df %>% group_by(Plot) %>% mutate(Avg=mean(Count))
Output:
# A tibble: 15 x 4
# Groups: Plot [3]
Plot Plant Count Avg
<int> <int> <int> <dbl>
1 101 1 9 14
2 101 2 15 14
3 101 3 5 14
4 101 4 15 14
5 101 5 26 14
6 102 1 9 15.2
7 102 2 26 15.2
8 102 3 9 15.2
9 102 4 15 15.2
10 102 5 17 15.2
11 103 1 12 11.6
12 103 2 6 11.6
13 103 3 22 11.6
14 103 4 12 11.6
15 103 5 6 11.6
Or using base R:
#Base R
newdf <- aggregate(Count~Plot,data=df,mean)
Output:
Plot Count
1 101 14.0
2 102 15.2
3 103 11.6

How can I identify the first row with value lower than the first row in different column in groups in R?

I have a data set that looks like this:
unique score value day
1 2 52 33.75 1
2 2 39 36.25 2
3 3 47 41.25 1
4 3 26 41.00 2
5 3 17 32.25 3
6 3 22 28.00 4
7 3 11 19.00 5
8 3 9 14.75 6
9 3 20 15.50 7
10 4 32 18.00 1
11 4 20 20.25 2
12 5 32 26.00 1
13 5 31 28.75 2
14 5 25 27.00 3
15 5 27 28.75 4
16 6 44 31.75 1
17 6 25 30.25 2
18 6 31 31.75 3
19 6 37 34.25 4
20 6 28 30.25 5
I would like to identify the first row in each group (unique) where the score is lower than the value on day 1.
I have tried this:
result<-df %>%
group_by(unique.id) %>%
filter(dailyMyoActivity < globaltma[globalflareday==1])
But it doesn't seem to do exactly what I want it to do.
Is there a way of doing this?
If I understood your rationale correctly, and if your dataset is already ordered by day, this dplyr solution may come in handy
library(dplyr)
df %>%
group_by(unique) %>%
filter(score < value[day==1]) %>%
slice(1)
Output
# A tibble: 3 x 4
# Groups: unique [3]
# unique score value day
# <int> <int> <dbl> <int>
# 1 3 26 41 2
# 2 5 25 27 3
# 3 6 25 30.2 2
This could help:
library(dplyr)
df %>% group_by(unique) %>% mutate(Index=ifelse(score<value & day==1,1,0))
# A tibble: 20 x 5
# Groups: unique [5]
unique score value day Index
<int> <int> <dbl> <int> <dbl>
1 2 52 33.8 1 0
2 2 39 36.2 2 0
3 3 47 41.2 1 0
4 3 26 41 2 0
5 3 17 32.2 3 0
6 3 22 28 4 0
7 3 11 19 5 0
8 3 9 14.8 6 0
9 3 20 15.5 7 0
10 4 32 18 1 0
11 4 20 20.2 2 0
12 5 32 26 1 0
13 5 31 28.8 2 0
14 5 25 27 3 0
15 5 27 28.8 4 0
16 6 44 31.8 1 0
17 6 25 30.2 2 0
18 6 31 31.8 3 0
19 6 37 34.2 4 0
20 6 28 30.2 5 0
Then you filter by Index==1
We could also use slice
library(dplyr)
df1 %>%
group_by(unique) %>%
slice(which(score < value[day == 1])[1])
# A tibble: 3 x 4
# Groups: unique [3]
# unique score value day
# <int> <int> <dbl> <int>
#1 3 26 41 2
#2 5 25 27 3
#3 6 25 30.2 2
data
df1 <- structure(list(unique = c(2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L), score = c(52L, 39L,
47L, 26L, 17L, 22L, 11L, 9L, 20L, 32L, 20L, 32L, 31L, 25L, 27L,
44L, 25L, 31L, 37L, 28L), value = c(33.75, 36.25, 41.25, 41,
32.25, 28, 19, 14.75, 15.5, 18, 20.25, 26, 28.75, 27, 28.75,
31.75, 30.25, 31.75, 34.25, 30.25), day = c(1L, 2L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 1L, 2L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20"))
Since you asked to identify the first row that fulfills the criterion score < value, a new column giving the row number has been added.
result <- df %>%
mutate(row_nr = row_number()) %>%
group_by(unique) %>%
filter(score < value) %>%
slice(1)

How to subtract one row from multiple rows by group, for data set with multiple columns in R?

I would like to learn how to subtract one row from multiple rows by group, and save the results as a data table/matrix in R. For example, take the following data frame:
data.frame("patient" = c("a","a","a", "b","b","b","c","c","c"), "Time" = c(1,2,3), "Measure 1" = sample(1:100,size = 9,replace = TRUE), "Measure 2" = sample(1:100,size = 9,replace = TRUE), "Measure 3" = sample(1:100,size = 9,replace = TRUE))
patient Time Measure.1 Measure.2 Measure.3
1 a 1 19 5 75
2 a 2 64 20 74
3 a 3 40 4 78
4 b 1 80 91 80
5 b 2 48 31 73
6 b 3 10 5 4
7 c 1 30 67 55
8 c 2 24 13 90
9 c 3 45 31 88
For each patient, I would like to subtract the row where Time == 1 from all rows associated with that patient. The result would be:
patient Time Measure.1 Measure.2 Measure.3
1 a 1 0 0 0
2 a 2 45 15 -1
3 a 3 21 -1 3
4 b 1 0 0 0
5 b 2 -32 -60 -7
6 b 3 -70 -86 -76
7 c 1 0 0 0
....
I have tried the following code using the dplyr package, but to no avail:
raw_patient<- group_by(rawdata,patient, Time)
baseline_patient <-mutate(raw_patient,cpls = raw_patient[,]- raw_patient["Time" == 0,])
As there are multiple columns, we can group by 'patient' and use mutate_at: the measure columns are selected in vars, and from each of those columns we subtract its value in the row where 'Time' is 1.
library(dplyr)
df1 %>%
group_by(patient) %>%
mutate_at(vars(matches("Measure")), funs(.- .[Time==1]))
# A tibble: 9 × 5
# Groups: patient [3]
# patient Time Measure.1 Measure.2 Measure.3
# <chr> <int> <int> <int> <int>
#1 a 1 0 0 0
#2 a 2 45 15 -1
#3 a 3 21 -1 3
#4 b 1 0 0 0
#5 b 2 -32 -60 -7
#6 b 3 -70 -86 -76
#7 c 1 0 0 0
#8 c 2 -6 -54 35
#9 c 3 15 -36 33
data
df1 <- structure(list(patient = c("a", "a", "a", "b", "b", "b", "c",
"c", "c"), Time = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Measure.1 = c(19L,
64L, 40L, 80L, 48L, 10L, 30L, 24L, 45L), Measure.2 = c(5L, 20L,
4L, 91L, 31L, 5L, 67L, 13L, 31L), Measure.3 = c(75L, 74L, 78L,
80L, 73L, 4L, 55L, 90L, 88L)), .Names = c("patient", "Time",
"Measure.1", "Measure.2", "Measure.3"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))

How to create matrix from list in r

I have a list in the form of an adjacency list; the data look like this:
Test User_ID hardest
1 77 A 5
2 77 B 4
3 77 C 4
4 78 A 4
5 78 B 5
6 78 C 4
7 79 A 5
8 79 B 4 ...
I want to make a matrix whose columns are the test numbers and whose rows are the User IDs,
with each cell holding the corresponding hardest value. It should look like this:
77 78 79
A 5 4 5
B 4 5 4
C 4 4 ....
How can I convert this list to a matrix?
You can try
library(reshape2)
dcast(df, User_ID~Test, value.var='hardest')
# User_ID 77 78 79
#1 A 5 4 5
#2 B 4 5 4
#3 C 4 4 NA
If it is a matrix you want
acast(df, User_ID~Test, value.var='hardest')
# 77 78 79
#A 5 4 5
#B 4 5 4
#C 4 4 NA
Or tidyr
library(tidyr)
spread(df, Test, hardest)
# User_ID 77 78 79
#1 A 5 4 5
#2 B 4 5 4
#3 C 4 4 NA
Or using xtabs
x1 <- xtabs(hardest~User_ID+Test, df)
attr(x1, "call") <- NULL
attr(x1, "class") <- NULL
dimnames(x1) <- unname(dimnames(x1))
x1
# 77 78 79
#A 5 4 5
#B 4 5 4
#C 4 4 0
data
df <- structure(list(Test = c(77L, 77L, 77L, 78L, 78L, 78L, 79L, 79L
), User_ID = c("A", "B", "C", "A", "B", "C", "A", "B"), hardest = c(5L,
4L, 4L, 4L, 5L, 4L, 5L, 4L)), .Names = c("Test", "User_ID", "hardest"
), class = "data.frame", row.names = c("1", "2", "3", "4", "5",
"6", "7", "8"))
Using base R, you can try
tapply(df$hardest,df[,c("User_ID","Test")],sum)

Resources