I am having a hard time with this one... I am trying to find points per group that are close to each other, and then group those close points together. Let me explain on the basis of the example data below:
Group X Y Z
1 110 3762 431 10
2 112 4950 880 10
3 113 5062 873 20
4 113 5225 874 30
5 113 5262 875 10
6 113 5300 874 20
structure(list(Group = c(110, 112, 113, 113, 113, 113), X = c(3762,
4950, 5062, 5225, 5262, 5300), Y = c(431, 880, 873, 874, 875,
874), Z = c(10, 10, 20, 30, 10, 20)), row.names = c(NA, -6L), class = "data.frame")
As we can see, we have a grouping column Group, the X & Y columns are our coordinates, and the Z column should be summarised further whenever points are defined as "close" (Euclidean distance < 100).
What I have tried:
I have successfully calculated the Euclidean distance between consecutive points using this loop:
for(i in 1:nrow(test)) {
  if(i > 1 && test$Group[i] == test$Group[i-1]) {
    # distance to the previous point within the same group
    test$Distance[i] <- sqrt((test$X[i] - test$X[i-1])^2 + (test$Y[i] - test$Y[i-1])^2)
  } else {
    # the first row of each group has no previous point
    test$Distance[i] <- NA
  }
}
Which gives me this:
Group X Y Z Distance
1 110 3762 431 10 NA
2 112 4950 880 10 NA
3 113 5062 873 20 NA
4 113 5225 874 30 163.00307
5 113 5262 875 10 37.01351
6 113 5300 874 20 38.01316
And here everything gets complicated, as there are NAs in the first row of each Group, etc.
What I want to achieve:
I would like to find the points per group whose distance is not greater than 100 (Distance < 100), and on that basis summarise them (a simple sum of the Z column). Done manually:
Group Z Grouped
1 110 10 no
2 112 10 no
3 113 20 no
4 113 60 yes
Thanks for the help!
That was difficult. I'm not sure I have figured it out completely.
#get data and libraries
library(tidyverse)
library(reshape2)    # for melt()
library(data.table)  # for rleid(), used in the update below
df <- read.table(text = "
Group X Y Z Distance
1 110 3762 431 10 NA
2 112 4950 880 10 NA
3 113 5062 873 20 NA
4 113 5225 874 30 163.00307
5 113 5262 875 10 37.01351
6 113 5300 874 20 38.01316", header = T, stringsAsFactors = F)
df %>%
  group_by(Group) %>%
  do(melt(outer(.$Distance, .$Distance, `-`))) %>%
  filter(between(value, -100, 0) | between(value, 0, 100)) %>%
  distinct(Var1) %>%
  mutate(grouped = 1) %>%
  rename(row = Var1) -> rows
df %>%
  group_by(Group) %>%
  mutate(row = row_number()) %>%
  left_join(rows, by = c("row", "Group")) %>%
  mutate(grouped = ifelse(is.na(grouped), "no", "yes")) %>%
  group_by(Group, grouped) %>%
  mutate(Z = ifelse(!is.na(grouped), sum(Z), Z)) %>%
  distinct(Group, Z, grouped)
# A tibble: 4 x 3
# Groups: Group, grouped [4]
Group Z grouped
<int> <int> <chr>
1 110 10 no
2 112 10 no
3 113 20 no
4 113 60 yes
Hope it's what you were looking for; if not, maybe it gives you some new ideas.
UPDATE: And now what I hope will really help you:
df %>%
  group_by(Group) %>%
  mutate(int1 = lead(Distance) < 100 | Distance < 100,
         int1 = replace(int1, is.na(int1), FALSE),
         int2 = rleid(int1),
         int2 = replace(int2, !int1 | is.na(int1), NA)) -> df2
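The key step is data.table::rleid(), which increments an id each time its input changes, so consecutive rows flagged as close share one sub-group id. A minimal illustration:
library(data.table)
rleid(c(FALSE, TRUE, TRUE, TRUE))
# [1] 1 2 2 2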
df2 %>%
  filter(int1) %>%
  group_by(Group, int2) %>%
  summarise(Z = sum(Z),
            Grouped = "yes") %>%
  select(Group, Z, Grouped) %>%
  bind_rows(df2 %>%
              filter(!int1) %>%
              mutate(Grouped = "no") %>%
              select(Group, Z, Grouped)) %>%
  arrange(Group)
# A tibble: 4 x 3
# Groups: Group [3]
Group Z Grouped
<int> <int> <chr>
1 110 10 no
2 112 10 no
3 113 60 yes
4 113 20 no
I worked out a little use case that can get you started. It is a base R approach using a for loop, with aggregation driven by a vector of columns and a paired vector of functions to aggregate them by.
df <- read.table(text = "
Group X Y Z Distance
1 110 3762 431 10 NA
2 112 4950 880 10 NA
3 113 5062 873 20 NA
4 113 5225 874 30 163.00307
5 113 5262 875 10 37.01351
6 113 5300 874 20 38.01316
7 114 5300 874 30 NA
8 114 5300 874 20 38.01316", header = T, stringsAsFactors = F)
aggregateIt <- function(df = data,                      # data.frame
                        returnRaw = F,                  # return the raw unaggregated df (only the first case of `grouped` per `subgroup` is usable in this application)
                        colsToAgg = c("Z1", "Z2", "Z3"), # cols to aggregate
                        how = c("sum", "sum", "max"))   # how to aggregate the columns: `Z1` by sum, `Z2` by sum and `Z3` by max
{
  count <- 1L
  result <- vector("integer", nrow(df))
  grouped <- vector("character", nrow(df))
  for(i in seq_len(length(result)-1L)){
    if(df$Group[i] != df$Group[i+1L]) {
      result[i] <- count
      grouped[i] <- "no"
      count <- count + 1L
      if((i+1L) == length(result)) {
        result[i+1L] <- count
        grouped[i+1L] <- "no"
      }
    } else {
      if(df$Distance[i+1L] > 100L) {
        result[i] <- count
        grouped[i] <- "no"
        count <- count + 1L
        if((i+1L) == length(result)) {
          result[i+1L] <- count
          grouped[i+1L] <- "no"
        }
      } else {
        result[i] <- count
        grouped[i] <- "yes"
        if((i+1L) == length(result)) {
          result[i+1L] <- count
          grouped[i+1L] <- "yes"
        }
      }
    }
  }
  df <- within(df, {subgroup <- result; grouped <- grouped})
  if(returnRaw) return(df)
  A <- Reduce(function(a, b) merge(a, b, by = "subgroup"),
              lapply(seq_along(how), function(x) aggregate(.~subgroup, df[, c(colsToAgg[x], "subgroup")], how[x])))
  B <- df[!duplicated(df$subgroup, fromLast = F), c("Group", "subgroup", "grouped")]
  out <- merge(A, B, by = "subgroup")
  return(out[, c("Group", colsToAgg, "grouped")])
}
aggregateIt(df = df, colsToAgg = "Z", how = "sum")
# Group Z grouped
#1 110 10 no
#2 112 10 no
#3 113 20 no
#4 113 60 yes
#5 114 50 yes
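To inspect the sub-grouping before aggregation, the returnRaw flag returns the annotated, unaggregated data.frame (a sketch using the same df):
aggregateIt(df = df, returnRaw = TRUE, colsToAgg = "Z", how = "sum")
# adds a subgroup id and a grouped flag to every input row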
Not claiming this is the most efficient solution, but it points toward one. Hope this helps!
My current table looks like this:
Region Diabetes percentage lower limit upper limit   N
     1        0         85          80          90 100
     1        1         15          10          16 500
     2        0         90          80          97 198
     2        1         10           7          20 134
     3        0         97          90          99 434
     3        1          3           0          10 283
This is the code I used to create that table.
CIregion_prop <- dta %>%
  filter(!is.na(Diabetes)) %>%
  filter(!is.na(region)) %>%
  group_by(region) %>%
  count(Diabetes) %>%
  mutate(perc = prop.table(n)*100,
         lower = lapply(n, prop.test, n = sum(n)),
         upper = sapply(lower, function(x) x$conf.int[2])*100,
         lower = sapply(lower, function(x) x$conf.int[1])*100)
I want to transform the table to the one below, organized by how many people are positive and negative across all regions combined, like this:
Diabetes percentage lower limit upper limit   N
       0         85          80          90 732
       1         15          10          16 917
How can I transform my above code?
Try using dplyr::select() to remove the region data and omit the group_by() step:
library(dplyr)
set.seed(1)  # added so the sample data is reproducible
region <- sample(c(1,2,3), 1649, replace = T)
Diabetes <- sample(c(0,1), 1649, replace = T)
df <- data.frame(region, Diabetes)
CI.no.region_prop <- df %>%
  filter(!is.na(Diabetes)) %>%
  filter(!is.na(region)) %>%
  dplyr::select(Diabetes) %>%
  #group_by(region) %>%
  count(Diabetes) %>%
  mutate(perc = prop.table(n)*100,
         lower = lapply(n, prop.test, n = sum(n)),
         upper = sapply(lower, function(x) x$conf.int[2])*100,
         lower = sapply(lower, function(x) x$conf.int[1])*100)
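For context, each element produced by lapply(n, prop.test, n = sum(n)) is an htest object whose conf.int slot is a two-element vector (lower, upper), which the sapply calls then extract. A quick illustration with made-up counts:
# prop.test(x, n)$conf.int holds c(lower, upper) on the proportion scale
ci <- prop.test(85, 585)$conf.int
ci[1] * 100  # lower limit in percent
ci[2] * 100  # upper limit in percent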
I have sales and cost data by model. The code below selects the TOP 10 models by sales; all the rest are summed up in a new category, "Others", which becomes the 11th row.
df <- data.frame(model = c("A","B","C","D","E","F","G","H","I","J","K","L","M","N"),
                 sale = c(100,300,140,456,345,456,456,780,40,560,560,456,350,500),
                 cost = c(1340,330,440,443,365,437,478,700,30,460,160,456,650,100))
#TOP10 by sale
order <- df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale)),
         model = ifelse(pos > 10, 'Others', model),
         pos = ifelse(pos > 10, 11, pos)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos)
Output:
model pos cost sale
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 E 10 365 345
11 Others 11 2140 580
The sales of model A are put into "Others" since it has low sales (100) and is not in the TOP 10.
Now, I want to include A in this TOP 10 in any case, no matter how much it sells. So the output should be TOP 9 + 'A' + 'Others':
Expected output:
model pos cost sale
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 A 10 1340 100
11 Others 11 2140 580
(So the change must be made in the given code.)
To be able to automate it, I created a function for you. You can easily specify the parameters and get the output quickly. Also, this function can be modified at a later stage based on your requirements:
Top10BySales = function(DataFrame, TopN = 10, IncludeModels) {
  ## DataFrame -> the data frame the user supplies
  ## TopN -> limit the analysis to the top N models; default value is 10
  ## IncludeModels -> here you specify which models you want to include regardless of rank
  ## Ranking the models based on sales
  df1 = DataFrame %>%
    arrange(desc(sale)) %>%
    mutate(Ranking = 1:nrow(DataFrame),
           Include = ifelse(Ranking <= TopN, model, "Other"))
  ## Grouping the models
  df2 = df1 %>%
    mutate(Ranking = ifelse(Include != "Other", Ranking, TopN + 1)) %>%  # was hard-coded as 10+1
    group_by(Include, Ranking) %>%
    summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop')
  ## Checking the length of IncludeModels
  if (length(IncludeModels) == 0) {
    df2 = df2
  } else {
    df3 = data.frame(ModelNames = IncludeModels)
    df3$Inclusion = ifelse(df3$ModelNames %in% df1$Include, "Yes", "No")
    df3 = df3 %>% filter(Inclusion == "No")
    df_original = df1 %>% filter(model %in% df3$ModelNames)
    df_original$Num = 1:nrow(df_original)
    for (i in 1:nrow(df_original)) {
      df2[nrow(df2) - df_original[i,]$Num, ] = df_original[i, c(1,4,3,2)]  ## Replacing the values
    }
  }
  return(df2 %>% arrange(Ranking))
}
Using the function
(The original answer illustrated the call with a screenshot, not reproduced here.)
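A sketch of such a call, assuming the df from the question and forcing "A" into the result:
Top10BySales(DataFrame = df, TopN = 10, IncludeModels = c("A"))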
Hope this helps!
One way could be to use bind_rows after removing the 10th line and appending only the rows where model == "A":
library(tidyverse)
#TOP10 by sale
df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale)),
         model = ifelse(pos > 10, 'Others', model),
         pos = ifelse(pos > 10, 11, pos)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos) %>%
  slice(-10) %>%
  bind_rows(df %>%
              filter(model == "A")) %>%
  mutate(pos = replace_na(pos, 10)) %>%
  arrange(pos)
model pos cost sale
<chr> <dbl> <dbl> <dbl>
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 A 10 1340 100
11 Others 11 2140 580
You can mutate in two steps before grouping, manipulating the pos variable to fix the order. This solves the problem raised in the comments on the other answer.
order <- df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale))) %>%
  mutate(pos = ifelse(model == "A" & pos > 10, 11, ifelse(pos > 10, 12, pos)),
         model = ifelse(pos > 11, 'Others', model)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos)
Note: This question was closed as a 'duplicate'. The solutions offered here and here did not answer my question. They showed how to merge when a single entry fell within a range; I'm trying to identify overlapping ranges and join them. Perhaps my title could have been better...
I have a main data set main_df with a start and end time (in seconds). I would like to see if the time range in main_df falls within a list of ranges in lookup_df, and if so, grab the value from lookup_df. Additionally, if the main_df falls within two different lookup ranges, duplicate the row so each value is represented.***
main_df <- tibble(start = c(30, 124, 161),
                  end   = c(80, 152, 185))
lookup_df <- tibble(start = c(34, 73, 126, 141, 174, 221),
                    end   = c(69, 123, 136, 157, 189, 267),
                    value = c('a','b','b','b','b','a'))
# Do something here to get the following:
> final_df
# A tibble: 4 x 4
start end value notes
<dbl> <dbl> <chr> <chr>
1 30 80 a ""
2 30 80 b "Duplicate because it falls within a and b"
3 124 152 b "Falls within two lookups but both are b"
4 161 185 b ""
***Edit: Looking at the way I've structured the problem...
#Not actual code
left_join(main_df, lookup_df, by = some_range_join_function) %>%
  add_rows(through_some_means)
Rather than having to add a new row, I could flip how I'm joining them...
semi_join(lookup_df, main_df, by = some_range_join_function)
You could do some logical comparisons and then handle the cases: what should happen if all matches are 'b', if they are 'a' and 'b', etc. In this way you could easily add more cases, e.g. both are 'a', one is 'a', more are 'b', which you didn't specify in the OP. The approach yields NULL if there are no matches, which gets omitted during rbind.
f <- \(x, y) {
  # rows of y whose interval overlaps the interval in x
  w <- which((x[1] >= y[, 1] & x[1] <= y[, 2]) | (x[2] >= y[, 1] & x[1] <= y[, 2]))
  if (length(w) > 0) {
    d <- data.frame(t(x), value = cbind(y[w, 3]), notes = '')
    if (length(w) >= 2) {
      if (all(d$value == 'b')) {
        d <- d[!duplicated(d$value), ]
        d$notes[1] <- 'both b'
      }
      else {
        d$notes[nrow(d)] <- 'a & b'
      }
    }
    d
  }
}
apply(main_df, 1, f, lookup_df, simplify=F) |> do.call(what=rbind)
# start end value notes
# 1 30 80 a
# 2 30 80 b a & b
# 3 124 152 b both b
# 4 161 185 b
Data (note the extra first row with start = 2, end = 1, which matches nothing and exercises the NULL case):
main_df <- structure(list(start = c(2, 30, 124, 161), end = c(1, 80, 152,
185)), row.names = c(NA, -4L), class = "data.frame")
lookup_df <- structure(list(start = c(34, 73, 126, 141, 174, 221), end = c(69,
123, 136, 157, 189, 267), value = c("a", "b", "b", "b", "b",
"a")), row.names = c(NA, -6L), class = "data.frame")
Another option is fuzzyjoin::interval_join:
library(fuzzyjoin)
library(dplyr)
interval_join(main_df, lookup_df, by = c("start", "end"), mode = "inner") %>%
  group_by(value, start.x, end.x) %>%
  slice(1) %>%
  select(start = start.x, end = end.x, value)
# A tibble: 4 × 3
# Groups: value, start, end [4]
start end value
<dbl> <dbl> <chr>
1 30 80 a
2 30 80 b
3 124 152 b
4 161 185 b
You can use foverlaps from data.table for this.
library(data.table)
setDT(main_df) # make it a data.table if needed
setDT(lookup_df) # make it a data.table if needed
setkey(main_df, start, end) # set the keys of 'y'
foverlaps(lookup_df, main_df, nomatch = NULL) # do the lookup
# start end i.start i.end value
# 1: 30 80 34 69 a
# 2: 30 80 73 123 b
# 3: 124 152 126 136 b
# 4: 124 152 141 157 b
# 5: 161 185 174 189 b
Or, to get the cleaned result (the OP's final_df):
unique(foverlaps(lookup_df, main_df, nomatch = NULL)[, .(start, end, value)])
start end value
1: 30 80 a
2: 30 80 b
3: 124 152 b
4: 161 185 b
A possible solution, based on powerjoin:
library(tidyverse)
library(powerjoin)
power_left_join(
  main_df, lookup_df,
  by = ~ (.x$start <= .y$start & .x$end >= .y$end) |
    (.x$start >= .y$start & .x$start <= .y$end) |
    (.x$start <= .y$start & .x$end >= .y$start),
  keep = "left") %>%
  distinct()
#> # A tibble: 4 x 3
#> start end value
#> <dbl> <dbl> <chr>
#> 1 30 80 a
#> 2 30 80 b
#> 3 124 152 b
#> 4 161 185 b
Or using tidyr::crossing:
library(tidyverse)
crossing(main_df, lookup_df,
         .name_repair = ~ c("start", "end", "start2", "end2", "value")) %>%
  filter((start <= start2 & end >= end2) |
           (start >= start2 & start <= end2) |
           (start <= start2 & end >= start2)) %>%
  select(-start2, -end2) %>%
  distinct()
#> # A tibble: 4 x 3
#> start end value
#> <dbl> <dbl> <chr>
#> 1 30 80 a
#> 2 30 80 b
#> 3 124 152 b
#> 4 161 185 b
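As an aside, for well-formed intervals (start <= end in both tables) the three-part filter reduces to the standard overlap test, which reads more compactly; a sketch:
crossing(main_df, lookup_df,
         .name_repair = ~ c("start", "end", "start2", "end2", "value")) %>%
  filter(start <= end2, end >= start2) %>%  # the usual interval-overlap test
  select(-start2, -end2) %>%
  distinct()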
You can use the fuzzyjoin package to join based on intervals with the fuzzyjoin::interval_*_join() functions.
I'll use an inner join, because if you use a semi join like you propose, you will lose the value col and get just three rows.
library(tidyverse)
library(fuzzyjoin)
fuzzyjoin::interval_inner_join(lookup_df, main_df, by = c("start", "end"), type = "any")
#> # A tibble: 5 × 5
#> start.x end.x value start.y end.y
#> <dbl> <dbl> <chr> <dbl> <dbl>
#> 1 34 69 a 30 80
#> 2 73 123 b 30 80
#> 3 126 136 b 124 152
#> 4 141 157 b 124 152
#> 5 174 189 b 161 185
As you can see, interval_inner_join() preserves the by cols from both tables, since they are not the same in a fuzzy join. Also, we still have separate rows for those cases in main_df that match multiple rows in lookup_df. Thus, we do some cleanup of the joined table:
interval_inner_join(lookup_df, main_df,
                    by = c("start", "end"),
                    type = "any") |>
  select(-ends_with(".x")) |>  # remove lookup interval cols
  distinct() |>                # remove duplicates
  rename_with(str_remove, ends_with(".y"), "\\.y")  # remove suffixes from col names
#> # A tibble: 4 × 3
#> value start end
#> <chr> <dbl> <dbl>
#> 1 a 30 80
#> 2 b 30 80
#> 3 b 124 152
#> 4 b 161 185
Finally, a clarification of terminology: In your question you state you want to join based on the interval from main_df falling within the interval from lookup_df. This is possible by using type = "within" in interval_*_join(). But based on the examples you provide, it appears you want to join based on any overlap. This can be done with type = "any", but it is the default, so you don't need to specify it.
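To illustrate the difference on this data, a sketch with type = "within" (main interval fully contained in a lookup interval):
interval_inner_join(main_df, lookup_df, by = c("start", "end"), type = "within")
# returns zero rows here: no main_df interval lies entirely inside a lookup interval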
I would like to remove rows where the value of a particular variable, HEIGHT_CM, is very different from the other values for the same id, PATIENT_ID.
Difference of >2 from the other values
Data:
df <- read.table(text = "PATIENT_ID MEASUREMENT_TAKEN_DATE HEIGHT_CM
1 20140305 163
1 20140409 163
1 20140528 164
1 20140730 164
1 20141210 99.7
2 20140305 155
2 20140527 157
2 20141111 78
3 20140721 90
4 20140528 168
4 20140627 167
4 20140917 167
4 20141002 70", header = TRUE, stringsAsFactors = FALSE)
Desired Output:
df <- read.table(text = "PATIENT_ID MEASUREMENT_TAKEN_DATE HEIGHT_CM
1 20140305 163
1 20140409 163
1 20140528 164
1 20140730 164
2 20140305 155
2 20140527 157
3 20140721 90
4 20140528 168
4 20140627 167
4 20140917 167", header = TRUE, stringsAsFactors = FALSE)
library(dplyr)
df <- df %>%
  group_by(PATIENT_ID) %>%
  mutate(difference = abs(mean(HEIGHT_CM, na.rm = T) - HEIGHT_CM)) %>%
  filter(difference <= 50)
Here you keep only rows where difference <= 50, where difference is the absolute deviation between the group mean and the single value. For this data the group means are roughly 150.7, 130, 90 and 143, so the outliers 99.7, 78 and 70 each deviate by more than 50 and are dropped. This gets the output you showed us.
A statistical way to remove outliers that gives your exact result:
library(dplyr)
df1 <- df %>%
  group_by(PATIENT_ID) %>%
  summarize(s = sd(HEIGHT_CM), m = mean(HEIGHT_CM))
df %>%
  inner_join(df1, by = "PATIENT_ID") %>%
  filter(is.na(s) | ((HEIGHT_CM > m - s) & (HEIGHT_CM < m + s))) %>%
  select(-c(4, 5))  # drop the helper columns s and m
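The is.na(s) guard matters because sd() of a single observation is NA, and patient 3 has only one measurement; without it that row would be dropped. A quick check:
sd(90)
# [1] NA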
I have a data.frame with coordinates and group information like this:
set.seed(1)
df = data.frame(x=round(runif(6,1,100)), y=round(runif(6,100,200)), group=c("A", "A", "B", "B", "B", "A"))
I want to create a "differences" data.frame from all combinations of 2 points, with the difference in X coordinates in the first column and the difference in Y in the second. I came up with this code, which is absolutely not efficient IMO:
comp.diff = function(H, data) { data[H[1], 1:2] - data[H[2], 1:2] }  # coordinate difference of two rows
comb = df %>% nrow %>% combn(2) %>% {cbind(., .[2:1, ])}  # make all combinations in both directions
apply(comb, 2, comp.diff, data = df) %>% do.call('rbind.data.frame', .)
But I can't achieve two more things:
I want to compute (or keep) only the differences for pairs of points taken from the same group.
I would like to keep, for each row of the output, the initial x, the initial y, and the id of the group concerned.
How can I achieve this in an efficient way (obviously, the number of combinations grows fast with N...)?
Thanks
Structure of expected output (extract):
#### delta.x delta.y old.x old.y group
#### 1 -11 28 27 166 A
#### 5 -63 76 27 118 A
#### ...
You can try
library(tidyverse)
# calculate the combinations per group
combs <- df %>%
  split(.$group) %>%
  map(~combn(1:nrow(.), 2))
# the calculation
df %>%
  mutate(index = 1:n()) %>%
  split(.$group) %>%
  map2(combs, ., ~data.frame(t(apply(.x, 2, function(i)
    cbind(paste(.y$index[i], collapse = "-"),
          .y$x[i[1]], .y$x[i[2]], .y$y[i[1]], .y$y[i[2]],
          -diff(.y$x[i]), -diff(.y$y[i])))), stringsAsFactors = F)) %>%
  bind_rows(.id = "group") %>%
  dplyr::select(1, index_diff = 2,
                x1_old = 3, x2_old = 4,
                y1_old = 5, y2_old = 6,
                diff_x = 7, diff_y = 8)
Edit
And all together in one pipe, including conversion of the value columns to numeric:
df %>%
  mutate(index = 1:n()) %>%
  split(.$group) %>%
  map(~data.frame(t(apply(combn(1:nrow(.), 2), 2, function(i)
    cbind(paste(.$index[i], collapse = "-"),
          .$x[i[1]], .$x[i[2]], .$y[i[1]], .$y[i[2]],
          -diff(.$x[i]), -diff(.$y[i])))), stringsAsFactors = F)) %>%
  bind_rows(.id = "group") %>%
  dplyr::select(1, index_diff = 2,
                x1_old = 3, x2_old = 4,
                y1_old = 5, y2_old = 6,
                diff_x = 7, diff_y = 8) %>%
  mutate_at(vars(x1_old:diff_y), as.numeric) %>%
  as.tibble()
# A tibble: 6 x 8
group index_diff x1_old x2_old y1_old y2_old diff_x diff_y
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1-2 27 38 194 166 -11 28
2 A 1-6 27 90 194 118 -63 76
3 A 2-6 38 90 166 118 -52 48
4 B 3-4 58 91 163 106 -33 57
5 B 3-5 58 21 163 121 37 42
6 B 4-5 91 21 106 121 70 -15