Related
I have a dataset that looks like the one below where there are three "pairs" of columns pertaining to the type (datA, datB, datC), and the total for each type (datA_total, datB_total, datC_total):
structure(list(datA = c(1L, NA, 5L, 3L, 8L, NA), datA_total = c(20L,
30L, 40L, 15L, 10L, NA), datB = c(5L, 5L, NA, 6L, 1L, NA), datB_total = c(80L,
10L, 10L, 5L, 4L, NA), datC = c(NA, 4L, 1L, NA, 3L, NA), datC_total = c(NA,
10L, 15L, NA, 20L, NA)), class = "data.frame", row.names = c(NA,
-6L))
# datA datA_total datB datB_total datC datC_total
#1 1 20 5 80 NA NA
#2 NA 30 5 10 4 10
#3 5 40 NA 10 1 15
#4 3 15 6 5 NA NA
#5 8 10 1 4 3 20
#6 NA NA NA NA NA NA
I'm trying to create a rowSums across each row to determine the total visits across each data type conditional on whether they meet a criteria of having ANY score ranging (1-5).
Here is my thought process:
Select only the variables that are the data types (i.e. datA, datB, datC)
Across each row based on EACH data type, determine if that data type meets a criteria (i.e. datA -> does it contain (1,2,3,4,5))
If that data type column does contain one of the 5 values above ^, then look to its paired total variable and read that value to be rowSummed (i.e. datA -> does it contain (1,2,3,4,5)? -> if yes, then grab datA_total value = 20).
The goal is to end up with a total column like below:
# datA datA_total datB datB_total datC datC_total overall_total
#1 1 20 5 80 NA NA 100
#2 NA 30 5 10 4 10 20
#3 5 40 NA 10 1 15 55
#4 3 15 6 5 NA NA 15
#5 8 10 1 4 3 20 24
#6 NA NA NA NA NA NA 0
You'll notice that row #2 only contained a total of 20 even though there is 30 in datA_total. This is a result of the conditional selection in that datA for row#2 contains "NA" rather than one of the five scores (1,2,3,4,5). Hence, the datA_total of 30 was not included in the rowSums calculation.
My code below shows the vectors I created and my attempt at a conditional rowSums but I end up getting an error regarding mutate... I'm not sure how to integrate the "conditional pairing" portion of this problem:
type_vars <- c("datA", "datB", "datC")
type_scores <- c("1", "2", "3", "4", "5")
type_visits <- c("datA_total", "datB_total", "datC_total")
df <- df %>%
mutate(overall_total = rowSums(all_of(type_visits[type_vars %in% type_scores])))
Any help/tips would be appreciated
dplyr's across should do the job.
library(dplyr)
# copying your tibble
data <-
tibble(
datA = c(1, NA, 5, 3, 8, NA),
datA_total = c(20, 30, 40, 15, 10, NA),
datB = c(5, 5, NA, 6, 1, NA),
datB_total = c(80, 10, 10, 5, 4, NA),
datC = c(NA, 4, 1, NA, 3, NA),
datC_total = c(NA, 10, 15, NA, 20, NA)
)
# For each dat* column, build an auxiliary column ({col}_aux):
#   (x %in% 1:5) is 1 when the score is 1-5 (and 0 for NA or any other
#   value), and get(cur_column() %>% paste0(., '_total')) looks up the
#   paired *_total column, so the aux column holds that total only when
#   the score qualifies.
data %>%
mutate(across(c('A', 'B', 'C') %>% paste0('dat', .), \(x) (x %in% 1:5) * get(cur_column() %>% paste0(., '_total')), .names = "{col}_aux")) %>%
# Sum the aux columns row by row, ignoring NA totals (all-NA rows give 0).
rowwise() %>%
mutate(overall_total = sum(across(ends_with('aux')), na.rm = TRUE)) %>%
# Drop the aux columns, keeping the original columns plus the result.
select(any_of(c(names(data), 'overall_total')))
# A tibble: 6 × 7
datA datA_total datB datB_total datC datC_total overall_total
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 20 5 80 NA NA 100
2 NA 30 5 10 4 10 20
3 5 40 NA 10 1 15 55
4 3 15 6 5 NA NA 15
5 8 10 1 4 3 20 24
6 NA NA NA NA NA NA 0
First, we create an 'aux' column for each dat. It is 0 if dat is not within 1:5, and dat_total otherwise. Then we sum ignoring NA.
This question already has answers here:
filter for complete cases in data.frame using dplyr (case-wise deletion)
(7 answers)
Closed 1 year ago.
I would like to remove NA from my data set and then organise them by IDs.
My dataset is similar to this:
df<-read.table (text="ID Name Surname Group A1 A2 A3 Goal Sea
21 Goal Robi A 4 4 4 G No
21 Goal Robi B NA NA NA NA NA
21 Goal Robi C NA NA NA NA NA
21 Goal Robi D 3 4 4 G No
33 Nami Si O NA NA NA NA NA
33 Nami Si P NA NA NA NA NA
33 Nami Si Q 3 4 4 G No
33 Nami Si Z 3 3 3 S No
98 Sara Bat MT 4 4 4 S No
98 Sara Bat NC 4 3 2 D No
98 Sara Bat MF NA NA NA NA NA
98 Sara Bat LC NA NA NA NA NA
66 Noor Shor MF NA NA NA NA NA
66 Noor Shor LC NA NA NA NA NA
66 Noor Shor MT1 4 4 4 G No
66 Noor Shor NC1 2 3 3 D No
", header=TRUE)
By removing the NA rows and columns, I should get a dataframe free of NAs. So I would like to get this table
ID Name Surname Group_1 A1 A2 A3 Goal_1 Sea_1 Group_2 A1_1 A2_2 A3_3 Goal_2 Sea_2
21 Goal Robi A 4 4 4 G No D 3 4 4 G No
33 Nami Si Q 3 4 4 G No Z 3 3 3 S No
98 Sara Bat MT 4 4 4 S No NC 4 3 2 D No
66 Noor Shor Mt1 4 4 4 G No NC1 2 3 3 D No
Is it possible to get it? It seems we could do it using pivot_longer, but I do not know how to get it.
search for complete.cases()
final = final[complete.cases(final), ]
A possible solution with the Tidyverse:
df <- structure(list(ID = c(21L, 21L, 21L, 21L, 33L, 33L, 33L, 33L,
98L, 98L, 98L, 98L, 66L, 66L, 66L, 66L), Name = c("Goal", "Goal",
"Goal", "Goal", "Nami", "Nami", "Nami", "Nami", "Sara", "Sara",
"Sara", "Sara", "Noor", "Noor", "Noor", "Noor"), Surname = c("Robi",
"Robi", "Robi", "Robi", "Si", "Si", "Si", "Si", "Bat", "Bat",
"Bat", "Bat", "Shor", "Shor", "Shor", "Shor"), Group = c("A",
"B", "C", "D", "O", "P", "Q", "Z", "MT", "NC", "MF", "LC", "MF",
"LC", "MT1", "NC1"), A1 = c(4L, NA, NA, 3L, NA, NA, 3L, 3L, 4L,
4L, NA, NA, NA, NA, 4L, 2L), A2 = c(4L, NA, NA, 4L, NA, NA, 4L,
3L, 4L, 3L, NA, NA, NA, NA, 4L, 3L), A3 = c(4L, NA, NA, 4L, NA,
NA, 4L, 3L, 4L, 2L, NA, NA, NA, NA, 4L, 3L), Goal = c("G", NA,
NA, "G", NA, NA, "G", "S", "S", "D", NA, NA, NA, NA, "G", "D"
), Sea = c("No", NA, NA, "No", NA, NA, "No", "No", "No", "No",
NA, NA, NA, NA, "No", "No")), class = "data.frame", row.names = c(NA,
-16L))
# NOTE(review): drop_na() and pivot_wider() come from tidyr — confirm
# library(tidyr) is loaded alongside dplyr.
new_df <- df %>%
# Remove every row containing at least one NA.
drop_na() %>%
group_by(ID) %>%
# n numbers the surviving rows within each ID (1, 2, ...).
mutate(n = row_number()) %>%
# Spread each ID's rows side by side: Group_1, A1_1, ..., Group_2, A1_2, ...
pivot_wider(
names_from = n,
values_from= c(Group, A1, A2, A3, Goal, Sea)
) %>%
# Move all second-occurrence columns to the end of the frame.
relocate(ends_with("2"), .after= last_col())
print(new_df)
We can group_by the ID columns and then filter out rows with all NAs in the target columns:
# Drop rows where every target column is NA, then keep one row per person.
df %>%
  group_by(ID, Name, Surname) %>%
  # A row survives unless every column from A1 through Sea is missing.
  filter(!if_all(A1:Sea, is.na)) %>%
  # slice(1) keeps the first row of each group (same as slice_head(n = 1)).
  slice(1)
# A tibble: 4 × 9
# Groups: ID, Name, Surname [4]
ID Name Surname Group A1 A2 A3 Goal Sea
<int> <chr> <chr> <chr> <int> <int> <int> <chr> <chr>
1 21 Goal Robi A 4 4 4 G No
2 33 Nami Si Q 3 4 4 G No
3 66 Noor Shor MT1 4 4 4 G No
4 98 Sara Bat MT 4 4 4 S No
I need to check if rows are partially duplicated and delete/overwrite those where 2 columns match a different row in which 3 values are present. One problem is that the "real" dataframe contains a couple of list columns, which makes some operations unfeasible. Best case would be if any row where a match can be found were checked independently of column numbers — meaning only the row with the most non-NA columns (out of all rows with matching column values) is kept.
o1 o2 o3
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 5 9 NA # this row has only 2 values which match values from row 11 but the last value is na
8 10 NA NA
9 12 NA NA
10 13 NA NA
11 5 9 14 # this row has values in all 3 columns
12 14 NA NA
13 8 11 15 # so does this row
14 16 NA NA
15 17 NA NA
16 18 NA NA
17 19 NA NA
18 20 NA NA
The result should be the same data frame - just without row 7 or where row 7 is overwritten by row 11.
This should be easy to do, but for some reason I didn't manage it (except with a convoluted for loop that is hard to generalize should more columns be added at a later time). Is there a straightforward way to do this?
dput of above df:
structure(list(o1 = c(1L, 2L, 3L, 4L, 6L, 7L, 5L, 10L, 12L, 13L,
5L, 14L, 8L, 16L, 17L, 18L, 19L, 20L), o2 = c(NA, NA, NA, NA,
NA, NA, 9L, NA, NA, NA, 9L, NA, 11L, NA, NA, NA, NA, NA), o3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 14L, NA, 15L, NA, NA, NA,
NA, NA)), row.names = c(NA, -18L), class = "data.frame")
If there is already an answer for something like this, please let me know.
I thought of using dplyr:
library(dplyr)
# Keep, for every (o1, o2) pair, the row with the fewest NAs, then
# restore the original row order.
df %>%
# rn remembers the original row position; count_na counts NAs per row.
mutate(rn = row_number(),
count_na = rowSums(across(o1:o3, is.na))) %>%
# Rows sharing o1 and o2 are treated as duplicates of each other.
group_by(o1, o2) %>%
# Within each group keep the row(s) with the fewest missing values.
slice_min(count_na) %>%
arrange(rn) %>%
ungroup() %>%
# Drop the auxiliary rn/count_na columns.
select(o1:o3)
This returns
# A tibble: 17 x 3
o1 o2 o3
<int> <int> <int>
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 10 NA NA
8 12 NA NA
9 13 NA NA
10 5 9 14
11 14 NA NA
12 8 11 15
13 16 NA NA
14 17 NA NA
15 18 NA NA
16 19 NA NA
17 20 NA NA
This solution is based on the following ideas:
For every row we count the number of NAs in this row.
We group for o1 and o2 to create groups of data that belong together. Here is a possible flaw: perhaps it is a better approach to group by o1 only or do some other grouping. This depends on the structure of your data: should 1, <NA>, <NA> be overwritten by 1, 2, <NA>?
After grouping, we select the row with the smallest number of NAs.
Finally we do some clean up: removing the auxiliary columns, arranging the data and ungrouping.
A partial solution to detect the duplicates; it remains to specify which rows to delete — I ran out of time. I've gone ahead and "duplicated" a couple more rows.
df=read.table(text="
o1 o2 o3
1 1 NA NA
2 2 NA NA
3 3 NA NA
4 4 NA NA
5 6 NA NA
6 7 NA NA
7 5 9 NA
8 10 NA NA
9 12 NA NA
10 13 NA NA
11 5 9 14
12 14 NA NA
13 8 11 15
14 16 NA NA
15 7 1 2
16 18 NA NA
17 7 1 3
18 20 NA NA",h=T)
The main trick is to calculate a distance matrix and check which rows have a distance of zero, since dist will automatically estimate a pairwise distance, removing missing values.
# Pairwise distances between rows; dist() drops NA entries pairwise, so
# rows that agree on all of their shared non-NA values sit at distance 0.
tmp <- as.matrix(dist(df))
# Blank out self-comparisons and the lower triangle so every matching
# pair of rows is reported exactly once.
diag(tmp) <- NA
tmp[lower.tri(tmp)] <- NA
# Row/column indices of each zero-distance (partially duplicated) pair.
tod <- data.frame(which(tmp == 0, arr.ind = TRUE))
resulting in
row col
X7 7 11
X6 6 15
X6.1 6 17
Here's another way which considers all columns, should work with any number of columns and regardless of their names or positions
library(dplyr)
mydf <- structure(list(o1 = c(1L, 2L, 3L, 4L, 6L, 7L, 5L, 10L, 12L, 13L,
5L, 14L, 8L, 16L, 17L, 18L, 19L, 20L),
o2 = c(NA, NA, NA, NA,
NA, NA, 9L, NA, NA, NA, 9L, NA, 11L, NA, NA, NA, NA, NA),
o3 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 14L, NA, 15L, NA, NA, NA,
NA, NA)),
row.names = c(NA, -18L),
class = "data.frame")
columns <- names(mydf)
dummy_cols <- paste0(columns, "_dummy")
# Deduplicate partially-filled rows: sort, forward-fill a duplicated copy
# of the columns, and keep only the first row per filled-in identity.
# FIX: using a bare external character vector in tidyselect contexts
# (across(columns), fill(dummy_cols), select(columns)) is deprecated and
# now warns/errors in modern dplyr/tidyr — wrap the vectors in all_of().
mydf %>%
  # duplicate the dataframe so the original values survive the fill below
  cbind(mydf %>% `names<-`(dummy_cols)) %>%
  # arrange across all columns so partial rows sit above their full match
  arrange(across(all_of(columns))) %>%
  # fill NAs downwards in the duplicated columns
  tidyr::fill(all_of(dummy_cols), .direction = "down") %>%
  # collapse the filled columns into a single dummy ID per logical row
  tidyr::unite(id_dummy, all_of(dummy_cols), sep = "") %>%
  # keep only the first physical row for each dummy ID
  group_by(id_dummy) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  # restore the original column set
  select(all_of(columns))
P.S. also replaces 1 - NA - NA with 1 - 2 - NA and replaces 1 - NA - NA with 1 - NA - 3
I have a data frame that looks like the following:
Year Day ID V1 V2 ....
2003 35 1102 3 6
2003 35 1103 5 NA
2003 35 1104 8 100
.....
2003 40 1102 NA 8
2003 40 1103 NA 10
2003 40 1104 9 NA
.....
.....
2018 49 1104 5 NA
.....
2018 50 1102 3 6
2018 50 1103 7 NA
2018 50 1104 NA 100
I would like to build a data frame that extracts, for each combination of Year and ID, the latest (highest value in the Day column) non-NA value in V1, V2... Based on the above data set, for Year = 2018 and ID = 1104, I would like to extract V1 = 5 (on Day = 49) and V2 = 100 (on Day = 50). If all values for that Year and ID combination are NA then I would like it to return NA.
We can create a function which gives us the latest non-NA value based on Day for each Vn column
# Return the last non-NA element of x; when x is entirely NA, fall back
# to x[1] (which is NA) — the same result the original
# which.max(cumsum(!is.na(x))) trick yields on an all-zero cumsum.
get_last_non_NA_value <- function(x) {
  non_na_idx <- which(!is.na(x))
  if (length(non_na_idx) == 0) {
    x[1]
  } else {
    x[max(non_na_idx)]
  }
}
and then apply that function for each Year and ID
library(dplyr)
# Per Year/ID group, reduce each Vn column to its latest (by Day) non-NA
# value.
# FIX: funs() was deprecated in dplyr 0.8 and is defunct in modern
# dplyr; across() is the supported replacement for summarise_at().
df %>%
  group_by(Year, ID) %>%
  # order each column's values by Day before taking the last non-NA one
  summarise(across(V1:V2, \(x) get_last_non_NA_value(x[order(Day)])))
# Year ID V1 V2
# <int> <int> <int> <int>
#1 2003 1102 3 8
#2 2003 1103 5 10
#3 2003 1104 9 100
#4 2018 1102 3 6
#5 2018 1103 7 NA
#6 2018 1104 5 100
EDIT
If we also want to extract corresponding Day for each value, we can change the function to return both values as comma-separated string
# Return "value,day" for the last non-NA value of x when ordered by y.
# BUG FIX: the original computed `ind` as a position within the
# y-ordered vector but then indexed the *unordered* x and y with it,
# which is only correct when the data already happen to be sorted by y.
# Sort both vectors first and index the sorted copies.
get_last_non_NA_value <- function(x, y) {
  ord <- order(y)
  x_sorted <- x[ord]
  y_sorted <- y[ord]
  # which.max() finds the first position where the running count of
  # non-NA values peaks, i.e. the last non-NA position (1 if all NA).
  ind <- which.max(cumsum(!is.na(x_sorted)))
  paste(x_sorted[ind], y_sorted[ind], sep = ",")
}
and then use cSplit to separate these comma separated values into different columns.
library(dplyr)
library(splitstackshape)
cols <- c("V1", "V2")
# Per Year/ID, collapse each Vn column to a "last_value,day" string,
# then split those strings into separate columns and rename them.
# FIX: funs() and the *_at verbs' funs-style renaming are defunct in
# modern dplyr; across() and rename_with() are the replacements.
df %>%
  group_by(Year, ID) %>%
  summarise(across(all_of(cols), \(x) get_last_non_NA_value(x, Day))) %>%
  # cSplit() (splitstackshape) splits "v,d" into <col>_1 and <col>_2
  cSplit(cols) %>%
  rename_with(\(nm) sub("_1", "_last_value", nm), contains("_1")) %>%
  rename_with(\(nm) sub("_2", "_days", nm), contains("_2"))
# Year ID V1_last_value V1_days V2_last_value V2_days
#1: 2003 1102 3 35 8 40
#2: 2003 1103 5 35 10 40
#3: 2003 1104 9 40 100 35
#4: 2018 1102 3 50 6 50
#5: 2018 1103 7 50 NA 50
#6: 2018 1104 5 49 100 50
Note that rename_at part renames the columns for better understanding of what value it holds, you can skip that part if you are not interested in renaming columns.
data
df <- structure(list(Year = c(2003L, 2003L, 2003L, 2003L, 2003L, 2003L,
2018L, 2018L, 2018L, 2018L), Day = c(35L, 35L, 35L, 40L, 40L,
40L, 49L, 50L, 50L, 50L), ID = c(1102L, 1103L, 1104L, 1102L,
1103L, 1104L, 1104L, 1102L, 1103L, 1104L), V1 = c(3L, 5L, 8L,
NA, NA, 9L, 5L, 3L, 7L, NA), V2 = c(6L, NA, 100L, 8L, 10L, NA,
NA, 6L, NA, 100L)), .Names = c("Year", "Day", "ID", "V1", "V2"
), class = "data.frame", row.names = c(NA, -10L))
You can use dplyr
Assuming you want max for V1 and V2
library(dplyr)
# Per Year/ID group, take each column's maximum while ignoring NAs.
# NOTE(review): when a group's column is entirely NA, max(..., na.rm =
# TRUE) returns -Inf with a warning — confirm that is acceptable here.
df %>%
group_by(Year, ID) %>%
summarise(Day = max(Day, na.rm = TRUE),
V1 = max(V1, na.rm = TRUE),
V2 = max(V2, na.rm = TRUE))
If for V1 and V2, you want first non-NA then
# Per Year/ID group, take the latest Day and the first non-NA value of
# V1 and V2 (setdiff(v, NA) drops the NAs, first() takes what is left).
df %>%
  group_by(Year, ID) %>%
  summarise(Day = max(Day, na.rm = TRUE),
            V1 = first(setdiff(V1, NA)),
            # BUG FIX: the original read first(setdiff(V1, NA)) here,
            # copying V1's value into the V2 column.
            V2 = first(setdiff(V2, NA)))
I have a dataset on this form:
set.seed(4561) # Make the results reproducible
df=data.frame(
colour=rep(c("green","red","blue"),each=3),
year=rep("2017",9),
month=rep(c(1,2,3),3),
price=c(200,254,188,450,434,490,100,99,97),
work=ceiling(runif(9,30,60)),
gain=ceiling(runif(9,1,10)),
work_weighed_price=NA,
gain_weighed_price=NA
)
For each colour, year, month I have a price (output variable) and two input variables called gain and work. In reality I have many more input variables, but this suffices to show what I desire to do with my dataframe.
> df
colour year month price work gain work_weighed_price gain_weighed_price
1 green 2017 1 200 33 9 NA NA
2 green 2017 2 254 56 5 NA NA
3 green 2017 3 188 42 8 NA NA
4 red 2017 1 450 39 3 NA NA
5 red 2017 2 434 45 2 NA NA
6 red 2017 3 490 36 8 NA NA
7 blue 2017 1 100 50 8 NA NA
8 blue 2017 2 99 45 8 NA NA
9 blue 2017 3 97 56 4 NA NA
I wish to calculate the weighted gain and work (and also the weighted price), where the weight is the price for that month and year, divided by the sum of price across colours:
desired_output=data.frame(
year=rep("2017",3),
month=rep(c(1,2,3),1),
price=c(200*(200/(200+450+100))+450*(450/(200+450+100))+100*(100/(200+450+100)),
254*(254/(254+434+99))+434*(434/(254+434+99))+99*(99/(254+434+99)),
188*(188/(188+490+97))+490*(490/(188+490+97))+97*(97/(188+490+97))),
work_weighed_price=c(47*(200/(200+450+100))+44*(450/(200+450+100))+52*(100/(200+450+100)),
44*(254/(254+434+99))+42*(434/(254+434+99))+32*(99/(254+434+99)),
38*(188/(188+490+97))+52*(490/(188+490+97))+52*(97/(188+490+97))) ,
gain_weighed_price=c(5*(200/(200+450+100))+8*(450/(200+450+100))+10*(100/(200+450+100)),
3*(254/(254+434+99))+7*(434/(254+434+99))+9*(99/(254+434+99)),
2*(188/(188+490+97))+4*(490/(188+490+97))+9*(97/(188+490+97)))
)
> desired_output
year month price work_weighed_price gain_weighed_price
1 2017 1 336.6667 45.86667 7.466667
2 2017 2 333.7649 41.38755 5.960610
3 2017 3 367.5523 48.60387 4.140645
How would I attack this in R?
You can use the weighted.mean function
# Price-weighted mean of price, work and gain per year/month.
# FIX: summarise_at()/funs() are defunct in modern dplyr; across() with
# a .names spec reproduces the original *_price_weighted column names.
df %>%
  group_by(year, month) %>%
  summarise(across(c(price, work, gain),
                   \(x) weighted.mean(x, price),
                   .names = "{.col}_price_weighted"))
# # A tibble: 3 x 5
# # Groups: year [?]
# year month price_price_weighted work_price_weighted gain_price_weighted
# <int> <int> <dbl> <dbl> <dbl>
# 1 2017 1 337 45.9 7.47
# 2 2017 2 334 41.4 5.96
# 3 2017 3 368 48.6 4.14
Or, in data.table
library(data.table)
setDT(df)
# Weighted mean of each .SDcols column, weighted by price, per year/month.
df[, lapply(.SD, weighted.mean, price),
   by = .(year, month),
   .SDcols = c("price", "work", "gain")]
# year month price work gain
# 1: 2017 1 336.6667 45.86667 7.466667
# 2: 2017 2 333.7649 41.38755 5.960610
# 3: 2017 3 367.5523 48.60387 4.140645
An approach using dplyr. Your use of runif in your example df without setting seed and the fact that it doesn't line up with your desired output is causing some confusion. In the code below, I use a df that's consistent with your desired output.
library(dplyr)
# Build explicit per-row weights, multiply each variable by its weight,
# and sum within year/month.
# FIX: mutate_at()/summarise_at() with funs() are defunct in modern
# dplyr; across() with a .names spec reproduces the *_weighed_price
# column names.
df %>%
  group_by(year, month) %>%
  # weight of each row = its price relative to the month's total price
  mutate(weight = price / sum(price)) %>%
  mutate(across(c(price, work, gain), \(x) x * weight,
                .names = "{.col}_weighed_price")) %>%
  summarise(across(ends_with("weighed_price"), sum))
# # A tibble: 3 x 5
# # Groups: year [?]
# year month work_weighed_price gain_weighed_price price_weighed_price
# <int> <int> <dbl> <dbl> <dbl>
# 1 2017 1 45.9 7.47 337.
# 2 2017 2 41.4 5.96 334.
# 3 2017 3 48.6 4.14 368.
df:
structure(list(colour = c("green", "green", "green", "red", "red",
"red", "blue", "blue", "blue"), year = c(2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L), month = c(1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L), price = c(200L, 254L, 188L, 450L,
434L, 490L, 100L, 99L, 97L), work = c(47L, 44L, 38L, 44L, 42L,
52L, 52L, 32L, 52L), gain = c(5L, 3L, 2L, 8L, 7L, 4L, 10L, 9L,
9L), work_weighed_price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA
), gain_weighed_price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("colour",
"year", "month", "price", "work", "gain", "work_weighed_price",
"gain_weighed_price"), class = "data.frame", row.names = c(NA,
-9L))
A base R solution could be the following sequence of tapply instructions.
# Price weighted by itself: sum of x_i * (x_i / sum(x)).
fun_price <- function(x) {
  total <- sum(x)
  sum(x * (x / total))
}
# Weighted mean of x with weights w: sum of x_i * (w_i / sum(w)).
fun_weighted <- function(x, w) {
  total_w <- sum(w)
  sum(x * (w / total_w))
}
desired <- data.frame(year = unique(df$year), month = sort(unique(df$month)))
desired$price <- with(df, tapply(price, month, FUN = fun_price))
# BUG FIX: tapply() only splits its *first* argument by group; passing
# `w = price` handed fun_weighted the full 9-element price vector for
# every 3-element month group, silently recycling x and producing wrong
# weighted values (the output no longer matched the desired result).
# Split both vectors by month and pair them up with mapply() instead.
desired$work_weighed_price <- with(df, mapply(fun_weighted, split(work, month), split(price, month)))
desired$gain_weighed_price <- with(df, mapply(fun_weighted, split(gain, month), split(price, month)))
desired
# year month price work_weighed_price gain_weighed_price
#1 2017 1 336.6667 45.86667 7.466667
#2 2017 2 333.7649 41.38755 5.960610
#3 2017 3 367.5523 48.60387 4.140645