Rescaling by group across data frames

Rescaling by group across data frames - r

I have two data frames
df1 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val1 = 1:20, val2 = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 3L)), .Names = c("g1", "g2", "val1", "val2"), row.names = c(NA, -20L), class = "data.frame")
df2 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val3 = c(5L, 6L, 7L, 3L, 4L, 5L, 2L, 3L, 4L, 8L, 9L, 10L, 4L, 5L, 6L, 5L, 6L)), .Names = c("g1", "g2", "val3"), row.names = c(NA, -17L), class = "data.frame")
> df1
g1 g2 val1 val2
1 A a 1 1
2 A a 2 2
3 A a 3 3
4 A a 4 4
5 A b 5 1
6 A b 6 2
7 A b 7 3
8 A c 8 1
9 A c 9 2
10 A c 10 3
11 B a 11 1
12 B a 12 2
13 B a 13 3
14 B b 14 1
15 B b 15 2
16 B b 16 3
17 B b 17 4
18 B c 18 1
19 B c 19 2
20 B c 20 3
> df2
g1 g2 val3
1 A a 5
2 A a 6
3 A a 7
4 A b 3
5 A b 4
6 A b 5
7 A c 2
8 A c 3
9 B c 4
10 B a 8
11 B a 9
12 B a 10
13 B b 4
14 B b 5
15 B b 6
16 B c 5
17 B c 6
My aim is to rescale df1$val2 to take values between the min and max values of df2$val3 within the respective groups.
I tried this:
library(dplyr)
df1 <- df1 %.% group_by(g1, g2) %.% mutate(rescaled=(max(df2$val3)-min(df2$val3))*(val2-min(val2))/(max(val2)-min(val2))+min(df2$val3))
But the output is different from what I expect. The problem is that I can neither cbind nor merge the two data frames due to their different lengths. Any hints?

Does this work?
library(plyr)
df3 <- ddply(df2, .(g1, g2), summarize, max.val=max(val3), min.val=min(val3))
merged.df <- merge(df1, df3, by=c("g1", "g2"), all.x=TRUE)
## Now rescale merged.df$val2 as desired

Related

remove rows with duplicate values in any other adjacent column

How can i remove rows with any same value that is in another column of the same row? For example,
df<-structure(list(V1 = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), V2 = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L,
2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), V3 = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L)), row.names = c(NA, -27L
), class = "data.frame")
##Top three rows
V1 V2 V3
1 1 1 1
2 2 1 1
3 3 1 1
4 1 2 1
5 2 2 1
6 3 2 1
7 1 3 1
8 2 3 1
In the following case (only showing 8 rows), I would remove every row accept rows 6 and 8 since they do not have any duplicate values in any column of the same row. I'm preferably looking for a data.table solution since I have a much larger dataframe.

You may use anyDuplicated for each row.
library(data.table)
setDT(df)
df[apply(df, 1, anyDuplicated) == 0]
# V1 V2 V3
#1: 3 2 1
#2: 2 3 1
#3: 3 1 2
#4: 1 3 2
#5: 2 1 3
#6: 1 2 3

An option using pairwise combn on the columns to check if there are equal values
df[!Reduce(`|`, combn(df, 2, FUN = function(x)
x[[1]] == x[[2]], simplify = FALSE))]
V1 V2 V3
1: 3 2 1
2: 2 3 1
3: 3 1 2
4: 1 3 2
5: 2 1 3
6: 1 2 3

Why heatmap row order is not the same as the data row order?

Hello for my work I creates a ggplot2 heatmap.
here are the data
structure(list(label = structure(c(4L, 5L, 7L, 2L, 3L, 6L, 8L,
1L, 4L, 5L, 7L, 2L, 3L, 6L, 8L, 1L, 4L, 5L, 7L, 2L, 3L, 6L, 8L,
1L), .Label = c("SP1", "SP2", "SP3", "SP4", "SP5", "SP6", "SP7",
"SP8"), class = "factor"), Categories = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("VAR1", "VAR2", "VAR3"), class = "factor"),
Value = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L,
3L, 3L, 3L, 2L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 1L), .Label = c("<NA>",
"A", "B"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
as you can see there is a specific order of labels :
1 - SP4
2 - SP5
3 - SP7
...
7 - SP8
8 - SP1
here is the df :
> table
label Categories Value
1 SP4 VAR1 <NA>
2 SP5 VAR1 <NA>
3 SP7 VAR1 <NA>
4 SP2 VAR1 <NA>
5 SP3 VAR1 <NA>
6 SP6 VAR1 <NA>
7 SP8 VAR1 <NA>
8 SP1 VAR1 <NA>
9 SP4 VAR2 A
10 SP5 VAR2 B
11 SP7 VAR2 B
12 SP2 VAR2 B
13 SP3 VAR2 B
14 SP6 VAR2 A
15 SP8 VAR2 <NA>
16 SP1 VAR2 <NA>
17 SP4 VAR3 A
18 SP5 VAR3 B
19 SP7 VAR3 B
20 SP2 VAR3 B
21 SP3 VAR3 B
22 SP6 VAR3 B
23 SP8 VAR3 <NA>
24 SP1 VAR3 <NA>
so I'm expecting to see the same row order if I do a heatmap with ggplot, but instead I get this order:
I used the code :
ggplot(mod_mat_gen_env, aes(x=Categories, y=label))+
geom_tile(aes(fill=Value))

One option is to refactor it in your data.frame, or you can provide the order using limits= option in scale_y_discrete()
df$Value = replace(as.character(df$Value),df$Value=="<NA>",NA)
list_correct_order=c("SP4","SP5","SP7","SP2","SP3","SP6","SP8","SP1")
ggplot(df, aes(x=Categories, y=label))+
geom_tile(aes(fill=Value)) +
scale_y_discrete(limits=rev(list_correct_order))

how to select specific row by a column

I have a data, as an example I show below
a = rep(1:5, each=3)
b = rep(c("a","b","c","a","c"), each = 3)
df = data.frame(a,b)
I want to select all the rows that have the "a"
I tried to do it with
df[df$a %in% a,]
Can someone give me an idea how to get them out?
df2<- structure(list(V1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), V2 = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), .Label = c("B02", "B03",
"B04", "B05", "B06", "B07", "C02", "C03", "C04", "C05", "C06",
"C07"), class = "factor")), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-24L))
I want to select specific rows that start with B but not all of them and just 02, 03, 04, 05
1 B02
1 B03
1 B04
1 B05
2 B02
2 B03
2 B04
2 B05
I also want to have the original data without them too

We need to check the 'b' column
df[df$b %in% 'a',]
For the updated question with 'df2', we can use paste to create the strings 'B02' to 'B05' and use %in% to subset
df2[df2$V2 %in% paste0("B0", 2:5),]
Or another option is grep
df2[grep("^B0[2-5]$", df2$V2),]

> df
a b
1 1 a
2 1 a
3 1 a
4 2 b
5 2 b
6 2 b
7 3 c
8 3 c
9 3 c
10 4 a
11 4 a
12 4 a
13 5 c
14 5 c
15 5 c
This basically says:
For all columns in df choose rows that have value equal to a
> rows_with_a<-df[df$b=='a', ]
> rows_with_a
a b
1 1 a
2 1 a
3 1 a
10 4 a
11 4 a
12 4 a

ggplot2 - how to create a clustered timeline?

How would you go about creating the graph below in R? I want to show the duration of different treatments for different patients.
Mock data here:
Start Day Stop Day
Patient 1 Drug 1 1 3
Drug 2 2 5
Drug 3 3 8
Patient 2 Drug 1 2 4
Drug 2 2 5
Drug 3 1 6
Patient 3 Drug 1 4 7
Drug 2 3 8
Drug 3 5 6

Your graph can be generated using geom_segment in the ggplot2 package:
df <- structure(list(Patient = structure(c(1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L, 3L), .Label = c("Patient1", "Patient2", "Patient3"), class = "factor"),
Drug = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("Drug1",
"Drug2", "Drug3"), class = "factor"), StartDay = c(1L, 2L,
3L, 2L, 2L, 1L, 4L, 3L, 5L), StopDay = c(3L, 5L, 8L, 4L,
5L, 6L, 7L, 8L, 6L)), .Names = c("Patient", "Drug", "StartDay",
"StopDay"), class = "data.frame", row.names = c(NA, -9L))
df$Drug <- factor(df$Drug, levels(df$Drug)[c(3,2,1)])
library(ggplot2)
ggplot(data=df, aes(color=Drug))+
geom_segment(aes(x=StartDay, xend=StopDay, y=Drug, yend=Drug),lwd=12)+
facet_grid(Patient~.)+xlab("Days")

Get sum of unique rows in table function in R

Suppose I have data which looks like this
Id Name Price sales Profit Month Category Mode Supplier
1 A 2 5 8 1 X K John
1 A 2 6 9 2 X K John
1 A 2 5 8 3 X K John
2 B 2 4 6 1 X L Sam
2 B 2 3 4 2 X L Sam
2 B 2 5 7 3 X L Sam
3 C 2 5 11 1 X M John
3 C 2 5 11 2 X L John
3 C 2 5 11 3 X K John
4 D 2 8 10 1 Y M John
4 D 2 8 10 2 Y K John
4 D 2 5 7 3 Y K John
5 E 2 5 9 1 Y M Sam
5 E 2 5 9 2 Y L Sam
5 E 2 5 9 3 Y M Sam
6 F 2 4 7 1 Z M Kyle
6 F 2 5 8 2 Z L Kyle
6 F 2 5 8 3 Z M Kyle
if I apply table function, it will just combines are the rows and result will be
K L M
X 4 4 1
Y 2 1 3
Z 0 1 2
Now what if I want not the sum of all rows but only sum of those rows with Unique Id
so it looks like
K L M
X 2 2 1
Y 1 1 2
Z 0 1 1
Thanks

If df is your data.frame:
# Subset original data.frame to keep columns of interest
df1 <- df[,c("Id", "Category", "Mode")]
# Remove duplicated rows
df1 <- df1[!duplicated(df1),]
# Create table
with(df1, table(Category, Mode))
# Mode
# Category K L M
# X 2 2 1
# Y 1 1 2
# Z 0 1 1
Or in one line using unique
table(unique(df[c("Id", "Category", "Mode")])[-1])
df <- structure(list(Id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), Name = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("A",
"B", "C", "D", "E", "F"), class = "factor"), Price = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), sales = c(5L, 6L, 5L, 4L, 3L, 5L, 5L, 5L, 5L, 8L, 8L, 5L,
5L, 5L, 5L, 4L, 5L, 5L), Profit = c(8L, 9L, 8L, 6L, 4L, 7L, 11L,
11L, 11L, 10L, 10L, 7L, 9L, 9L, 9L, 7L, 8L, 8L), Month = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), Category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor"), Mode = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 2L, 3L), .Label = c("K",
"L", "M"), class = "factor"), Supplier = structure(c(1L, 1L,
1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L
), .Label = c("John", "Kyle", "Sam"), class = "factor")), .Names = c("Id",
"Name", "Price", "sales", "Profit", "Month", "Category", "Mode",
"Supplier"), class = "data.frame", row.names = c(NA, -18L))

We can try
library(data.table)
dcast(unique(setDT(df1[c('Category', 'Mode', 'Id')])),
Category~Mode, value.var='Id', length)
# Category K L M
#1: X 2 2 1
#2: Y 1 1 2
#3: Z 0 1 1
Or with dplyr
library(dplyr)
df1 %>%
distinct(Id, Category, Mode) %>%
group_by(Category, Mode) %>%
tally() %>%
spread(Mode, n, fill=0)
# Category K L M
# (chr) (dbl) (dbl) (dbl)
#1 X 2 2 1
#2 Y 1 1 2
#3 Z 0 1 1
Or as #David Arenburg suggested, a variant of the above is
df1 %>%
distinct(Id, Category, Mode) %>%
select(Category, Mode) %>%
table()

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Rescaling by group across data frames - r

Does this work? library(plyr) df3 <- ddply(df2, .(g1, g2), summarize, max.val=max(val3), min.val=min(val3)) merged.df <- merge(df1, df3, by=c("g1", "g2"), all.x=TRUE) ## Now rescale merged.df$val2 as desired

Related

remove rows with duplicate values in any other adjacent column

Why heatmap row order is not the same as the data row order?

how to select specific row by a column

ggplot2 - how to create a clustered timeline?

Get sum of unique rows in table function in R

Categories

Resources