How to find a row with minimum changes - R

I have a data set like this
df <- structure(list(Idm = c("AAA", "AAA", "AAA", "AAAA", "AAAA", "AAAA",
"AAAAA", "AAAAA", "AAAAA", "BB", "BB", "BB", "BBB", "BBB", "BBB",
"BBBBB", "BBBBB", "BBBBB", "CCCC", "CCCC", "CCCC", "CCCCC", "CCCCC",
"CCCCC"), name = c("G", "A", "B", "G", "A", "B", "G", "A", "B",
"G", "A", "B", "G", "A", "B", "G", "A", "B", "G", "A", "B", "G",
"A", "B"), value = c(2506.3, 5306.7, 6558.1, 2270.1, 5449.3,
5790.2, 334.1, 947, 1128.2, 809, 1944, 2539, 1302.3, 3447, 4107.7,
2562.7, 5127.6, 4585.8, 911, 5121.9, 6313.4, 832.8, 1230.2, 1180.8
), sd = c(1865.19913950227, 2221.04246770145, 5885.17898538354,
1273.08845332915, 2008.35456364989, 3037.90616433973, 181.270083944741,
446.8334626383, 490.805504587442, 633.895459309604, 961.277571776227,
2444.30575487874, 1012.39068051815, 1393.79545127684, 5826.31668323421,
1476.91924739755, 1508.60484223007, 4258.95203228838, 838.051710815031,
2911.84582696268, 4510.54727758543, 507.433227134369, 562.122249455875,
1674.86096835926), n = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L)), row.names = c(NA,
-24L), groups = structure(list(Idm = c("AAA", "AAAA", "AAAAA",
"BB", "BBB", "BBBBB", "CCCC", "CCCCC"), .rows = structure(list(
1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 19:21, 22:24), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I want to know which of these Idm has the lowest variation. For instance,
AAAAA G 334.1 181.2700839
AAAAA A 947 446.8334626
AAAAA B 1128.2 490.8055046
and this one also has low variation across the 3 repeats:
CCCCC G 832.8 507.4332271 1
CCCCC A 1230.2 562.1222495 2
CCCCC B 1180.8 1674.860968 3
I can see this when I plot the data.
I am thinking of taking the average of the 3 replicate values and the average of the 3 sds for each Idm; the Idm with the lowest of both averages would be the one with the lowest variation.

Clarification is needed on the question, but if you just want the rows with the minimum standard deviation by group then you can use dplyr.
library(dplyr)
df %>%
  dplyr::group_by(Idm) %>%
  slice(which.min(sd)) %>%
  arrange(sd)
Output
# A tibble: 8 × 5
# Groups: Idm [8]
Idm name value sd n
<chr> <chr> <dbl> <dbl> <int>
1 AAAAA G 334. 181. 1
2 CCCCC G 833. 507. 1
3 BB G 809 634. 1
4 CCCC G 911 838. 1
5 BBB G 1302. 1012. 1
6 AAAA G 2270. 1273. 1
7 BBBBB G 2563. 1477. 1
8 AAA G 2506. 1865. 1
Or you can sort each group and retain all data:
df %>%
  dplyr::group_by(Idm) %>%
  arrange(Idm, sd)
Output
# A tibble: 24 × 5
# Groups: Idm [8]
Idm name value sd n
<chr> <chr> <dbl> <dbl> <int>
1 AAAAA G 334. 181. 1
2 AAAAA A 947 447. 2
3 AAAAA B 1128. 491. 3
4 CCCCC G 833. 507. 1
5 CCCCC A 1230. 562. 2
6 BB G 809 634. 1
7 CCCC G 911 838. 1
8 BB A 1944 961. 2
9 BBB G 1302. 1012. 1
10 AAAA G 2270. 1273. 1
# … with 14 more rows
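If you instead want a single per-Idm measure of variation along the lines the question suggests (combining value and sd), one possibility is to rank the groups by their mean coefficient of variation. This is only a sketch of one interpretation of "lowest variation", assuming the coefficient of variation sd/value is an acceptable summary:
library(dplyr)

# Rank each Idm by its mean coefficient of variation (sd / value), so the
# group whose replicates vary least relative to their magnitude comes first.
df %>%
  group_by(Idm) %>%
  summarise(mean_cv = mean(sd / value)) %>%
  arrange(mean_cv)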

Related

Add column with number of specific values within group in dataframe

I have a dataframe:
ID SUB_ID Action
1 A Open
1 A Download
1 A Close
1 B Open
1 B Search
1 B Download
1 B Close
2 AA Open
2 AA Download
2 AA Close
2 BB Open
2 BB Search
2 BB Filter
2 BB Close
3 C Open
3 C Search
3 C Filter
3 C Close
I want to get a table with the ID, the number of SUB_IDs per ID, and the number of "Download" values in the Action column for each ID. So, the desired result is:
ID SUB_ID_n Download_n
1 2 2
2 2 1
3 1 0
How could I do that?
Count the number of unique values using n_distinct, and sum over logical values to count the rows with Action == 'Download'.
library(dplyr)
df %>%
  group_by(ID) %>%
  summarise(SUB_ID_n = n_distinct(SUB_ID, na.rm = TRUE),
            Download_n = sum(Action == 'Download'))
# ID SUB_ID_n Download_n
# <int> <int> <int>
#1 1 2 2
#2 2 2 1
#3 3 1 0
In data.table this can be written as:
library(data.table)
setDT(df)[, .(SUB_ID_n = uniqueN(SUB_ID, na.rm = TRUE),
              Download_n = sum(Action == 'Download')), ID]
data
df <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L), SUB_ID = c("A", "A", "A", "B",
"B", "B", "B", "AA", "AA", "AA", "BB", "BB", "BB", "BB", "C",
"C", "C", "C"), Action = c("Open", "Download", "Close", "Open",
"Search", "Download", "Close", "Open", "Download", "Close", "Open",
"Search", "Filter", "Close", "Open", "Search", "Filter", "Close"
)), class = "data.frame", row.names = c(NA, -18L))
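For completeness, a base-R sketch of the same summary (using the df defined above), counting distinct SUB_IDs and "Download" rows per ID with tapply:
sub_id_n   <- tapply(df$SUB_ID, df$ID, function(x) length(unique(x)))
download_n <- tapply(df$Action == "Download", df$ID, sum)
data.frame(ID = as.integer(names(sub_id_n)),
           SUB_ID_n = as.integer(sub_id_n),
           Download_n = as.integer(download_n))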

Create a dataframe with list elements with dplyr in R

This is my data, a list of data frames:
df<-list(structure(list(Col1 = structure(1:6, .Label = c("A", "B",
"C", "D", "E", "F"), class = "factor"), Col2 = structure(c(1L,
2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L)), structure(list(Col1 = structure(c(1L, 4L, 5L, 6L, 2L,
3L), .Label = c("A", "E", "H", "M", "N", "P"), class = "factor"),
Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B",
"C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L)), structure(list(Col1 = structure(c(1L, 4L, 6L, 5L, 2L,
3L), .Label = c("A", "W", "H", "M", "T", "U"), class = "factor"),
Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B",
"C", "D", "S", "G"), class = "factor")), class = "data.frame", row.names = c(NA,
-6L)))
I want to extract Col1 of the first list element, df[[1]][1], as a data frame. Then I want to bind Col1 of the second list element to it, so I have a data frame with 2 columns.
After this I want to bind column 1 of the third list element to the data frame with two columns, so I have a data frame with 3 columns.
In other words, my data frame should have 3 columns: the first column of each element of my list.
Can the dplyr package help me do this?
Any help?
You can use lapply to extract the three columns named "Col1" in one go. Then set the names of the result.
col1 <- as.data.frame(lapply(df, '[[', "Col1"))
names(col1) <- letters[seq_along(col1)]
col1
# a b c
#1 A A A
#2 B M M
#3 C N U
#4 D P T
#5 E E W
#6 F H H
Choose any other column names that you might find better.
A dplyr way could be
library(dplyr)
df %>%
  unlist(recursive = FALSE) %>%
  as.data.frame %>%
  select(., starts_with("Col1"))
# Col1 Col1.1 Col1.2
#1 A A A
#2 B M M
#3 C N U
#4 D P T
#5 E E W
#6 F H H
With map_dfc from purrr:
library(purrr)
map_dfc(df, `[`, 1)
Output:
Col1 Col11 Col12
1 A A A
2 B M M
3 C N U
4 D P T
5 E E W
6 F H H
An alternative use of map_dfc makes use of purrr's concise element-extraction syntax, which allows specifying elements of elements by name or position. The first call below, for example, is equivalent to
map_dfc(df, `[[`, 1)
which differs from the use of [ in that the columns will not be named variations of Col1 but instead get V names, which may be desirable since names like Col11 and Col12 may be confusing.
df <- list(structure(list(Col1 = structure(1:6, .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"), Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA, -6L)), structure(list(Col1 = structure(c(1L, 4L, 5L, 6L, 2L, 3L), .Label = c("A", "E", "H", "M", "N", "P"), class = "factor"), Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "F", "G"), class = "factor")), class = "data.frame", row.names = c(NA, -6L)), structure(list(Col1 = structure(c(1L, 4L, 6L, 5L, 2L, 3L), .Label = c("A", "W", "H", "M", "T", "U"), class = "factor"), Col2 = structure(c(1L, 2L, 3L, 2L, 4L, 5L), .Label = c("B", "C", "D", "S", "G"), class = "factor")), class = "data.frame", row.names = c(NA, -6L)))
library(purrr)
map_dfc(df, 1)
#> # A tibble: 6 x 3
#> V1 V2 V3
#> <fct> <fct> <fct>
#> 1 A A A
#> 2 B M M
#> 3 C N U
#> 4 D P T
#> 5 E E W
#> 6 F H H
map_dfc(df, "Col1")
#> # A tibble: 6 x 3
#> V1 V2 V3
#> <fct> <fct> <fct>
#> 1 A A A
#> 2 B M M
#> 3 C N U
#> 4 D P T
#> 5 E E W
#> 6 F H H
Created on 2018-09-19 by the reprex package (v0.2.0).
res <- 1:nrow(df[[1]][1])              # placeholder column to cbind onto
for (i in 1:length(df)) {
  print(as.vector(df[[i]][1]))         # show the column being added
  res <- cbind(res, as.data.frame(df[[i]][1]))
}
res$res <- NULL                        # drop the placeholder column
So, the output is:
Col1 Col1 Col1
1 A A A
2 B M M
3 C N U
4 D P T
5 E E W
6 F H H
Using dplyr
library(dplyr)
df %>%
  sapply('[[', 1) %>%
  as.data.frame
#returns
V1 V2 V3
1 A A A
2 B M M
3 C N U
4 D P T
5 E E W
6 F H H
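A hedged side note: in purrr >= 1.0.0, map_dfc() is superseded, so a rough equivalent of the purrr calls above is map() plus list_cbind(). This is only a sketch, assuming that purrr version:
library(purrr)

df |>
  map(~ .x[1]) |>   # keep only the first column of each data frame
  list_cbind()      # column-bind the pieces, repairing the duplicated Col1 names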

Compute mean pairwise covariance between elements in a list

I have the following data frames:
# df1
id cg_v
1 a
2 b
3 a b
4 b c
5 b c d
6 d
# df2
id cg
1 a
2 b
3 a
3 b
4 b
4 c
5 b
5 c
5 d
6 d
I need to add a column to df1 that contains the mean covariance computed across each pair of elements in cg_v. If cg_v contains only one element, then I would like the new column to contain its variance.
I can get a covariance matrix by cov(crossprod(table(df2)))
# a b c d
a 0.9166667 0.0000000 -0.5833333 -0.6666667
b 0.0000000 2.0000000 1.0000000 0.0000000
c -0.5833333 1.0000000 0.9166667 0.3333333
d -0.6666667 0.0000000 0.3333333 0.6666667
What do I do from here?
The end result should be like this:
# df1
id cg_v cg_cov
1 a 0.9166667
2 b 2.0000000
3 a b 0.0000000
4 b c 1.0000000
5 b c d 0.4444444 # This is equal to (1.0000000 + 0.3333333 + 0.0000000)/3
6 d 0.6666667
Code to generate df1 and df2:
df1 <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L),
cg_v = c("a", "b", "a b", "b c", "b c d", "d")),
.Names = c("id", "cg_v"),
class = "data.frame", row.names = c(NA, -6L))
df2 <- structure(list(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d")),
.Names = c("id", "cg"),
class = "data.frame", row.names = c(NA, -10L))
I think I found a solution to this problem using data.table and reshape. What do you want to do with the three letters b c d? I assumed that you want the covariance of the first two letters:
require(reshape)
require(data.table)
dt1 <- data.table(id = c(1L, 2L, 3L, 4L, 5L, 6L),
                  cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
dt2 <- data.table(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
                  cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- data.table(melt(cov(crossprod(table(dt2)))))
dt1 <- cbind(dt1, t(sapply(strsplit(as.character(dt1$cg_v), " "), function(x) x[1:2])))
# replace the NA in V2 with the first column
dt1[is.na(V2), V2 := V1]
# merge them on the two columns
setkey(dt1, "V1", "V2")
setkey(cov_dt, "X1", "X2")
result <- cov_dt[dt1]
result[, .(id, cg_v, value)]
id cg_v value
1: 1 a 0.9166667
2: 3 a b 0.0000000
3: 2 b 2.0000000
4: 4 b c 1.0000000
5: 5 b c d 1.0000000
6: 6 d 0.6666667
Variant which also works if there are more than 2 letters (not the most efficient code):
require(reshape)
require(combinat)
df1 <- data.frame(id = c(1L, 2L, 3L, 4L, 5L, 6L),
                  cg_v = c("a", "b", "a b", "b c", "b c d", "d"))
df2 <- data.frame(id = c(1L, 2L, 3L, 3L, 4L, 4L, 5L, 5L, 5L, 6L),
                  cg = c("a", "b", "a", "b", "b", "c", "b", "c", "d", "d"))
cov_dt <- cov(crossprod(table(df2)))
mat <- sapply(strsplit(as.character(df1$cg_v), " "),
              function(x) if (length(x) == 1) c(x, x) else x)
# all elements should now have length of at least 2
sapply(mat, length) > 1
mat <- sapply(mat, function(x) matrix(combn(x, 2), nrow = 2))
df1$cg_cov <- sapply(mat, function(x) mean(apply(x, 2, function(x) cov_dt[x[1], x[2]])))
df1
id cg_v cg_cov
1 1 a 0.9166667
2 2 b 2.0000000
3 3 a b 0.0000000
4 4 b c 1.0000000
5 5 b c d 0.4444444
6 6 d 0.6666667
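A compact base-R sketch of the same idea, directly implementing "mean pairwise covariance, or the variance for a single element" (it assumes df1 and df2 as defined in the question):
cmat <- cov(crossprod(table(df2)))   # the covariance matrix from the question
df1$cg_cov <- sapply(strsplit(as.character(df1$cg_v), " "), function(els) {
  if (length(els) == 1) {
    cmat[els, els]                   # a single element: use its variance
  } else {
    prs <- combn(els, 2)             # all unordered pairs of elements
    mean(apply(prs, 2, function(p) cmat[p[1], p[2]]))
  }
})
df1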

Subset and compare on two columns

I have a table made up of three columns: a person identifier, an event type (A or B), and the date on which the event happened.
This is shown here:
Person Event EventDate
1 A 22/03/15
1 A 22/05/15
1 B 12/12/15
1 B 12/12/15
2 B 01/13/12
2 B 02/03/12
2 B 03/08/14
2 A 05/09/14
3 B 02/02/02
3 A 03/05/14
3 B 03/06/14
3 A 17/11/15
3 A 17/02/16
3 A 18/05/16
3 A 23/06/16
I want to subset the data. The subset should capture all event A rows within a Person that occur after the first event B. The output would be:
Person Event EventDate
2 A 05/09/14
3 A 03/05/14
3 A 17/11/15
3 A 17/02/16
3 A 18/05/16
3 A 23/06/16
I think the problem I have is knowing how to compare rows for a Person based on a comparison of two columns (Event and EventDate).
This is the dput of the original data above
structure(list(Person..Event...EventDate = c("1 A 22/03/15",
"1 A 22/05/15", "1 B 12/12/15", "1 B 12/12/15", "2 B 01/13/12",
"2 B 02/03/12", "2 B 03/08/14", "2 A 05/09/14", "3 B 02/02/02",
"3 A 03/05/14", "3 B 03/06/14", "3 A 17/11/15", "3 A 17/02/16",
"3 A 18/05/16", "3 A 23/06/16")), .Names = "Person..Event...EventDate", class = "data.frame", row.names = c(NA,
-15L))
We can use data.table. Convert the 'data.frame' to a 'data.table' with setDT(df1). Grouped by 'Person', we order by 'Person' and 'EventDate' (after converting to Date class); then, if the cumulative sum of Event == "B" is 1 at the first element (i.e. the group starts with a B), we get the row indices where Event == "A" and use them to subset the original dataset.
library(data.table)
setDT(df1)[df1[order(Person, as.Date(EventDate, '%d/%m/%y')),
               if (cumsum(Event == "B")[1] == 1) .I[Event == "A"], by = Person]$V1]
# Person Event EventDate
#1: 2 A 05/09/14
#2: 3 A 03/05/14
#3: 3 A 17/11/15
#4: 3 A 17/02/16
#5: 3 A 18/05/16
#6: 3 A 23/06/16
Or we can use dplyr
library(dplyr)
df1 %>%
  arrange(Person, as.Date(EventDate, '%d/%m/%y')) %>%
  group_by(Person) %>%
  filter(first(Event == "B") & Event == "A")
# Person Event EventDate
# <int> <chr> <chr>
#1 2 A 05/09/14
#2 3 A 03/05/14
#3 3 A 17/11/15
#4 3 A 17/02/16
#5 3 A 18/05/16
#6 3 A 23/06/16
data
df1 <- structure(list(Person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), Event = c("A", "A", "B", "B", "B", "B",
"B", "A", "B", "A", "B", "A", "A", "A", "A"), EventDate = c("22/03/15",
"22/05/15", "12/12/15", "12/12/15", "01/13/12", "02/03/12", "03/08/14",
"05/09/14", "02/02/02", "03/05/14", "03/06/14", "17/11/15", "17/02/16",
"18/05/16", "23/06/16")), .Names = c("Person", "Event", "EventDate"
), class = "data.frame", row.names = c(NA, -15L))
This can be done using sqldf. I'm assuming the data is sorted by date.
library(sqldf)

v1 <- structure(list(Person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
  3L, 3L, 3L, 3L, 3L), Event = c("A", "A", "B", "B", "B", "B", "B", "A", "B", "A",
  "B", "A", "A", "A", "A"), EventDate = c("22/03/15", "22/05/15", "12/12/15",
  "12/12/15", "01/10/12", "02/03/12", "03/08/14", "05/09/14", "02/02/02",
  "03/05/14", "03/06/14", "17/11/15", "17/02/16", "18/05/16", "23/06/16")),
  .Names = c("Person", "Event", "EventDate"), class = "data.frame",
  row.names = c(NA, -15L))

v1$EventDate <- as.Date(v1$EventDate, '%d/%m/%y')
v2 <- v1[v1$Event == 'B', ]
v2 <- v2[!duplicated(v2$Person), ]   # keep each person's first B row
v3 <- v1[v1$Event == 'A', ]
sqldf("select a.* from v3 a, v2 b where a.EventDate > b.EventDate And a.Person = b.Person")
Person Event EventDate
1 2 A 2014-09-05
2 3 A 2014-05-03
3 3 A 2015-11-17
4 3 A 2016-02-17
5 3 A 2016-05-18
6 3 A 2016-06-23
v1
Person Event EventDate
1 1 A 2015-03-22
2 1 A 2015-05-22
3 1 B 2015-12-12
4 1 B 2015-12-12
5 2 B 2012-10-01
6 2 B 2012-03-02
7 2 B 2014-08-03
8 2 A 2014-09-05
9 3 B 2002-02-02
10 3 A 2014-05-03
11 3 B 2014-06-03
12 3 A 2015-11-17
13 3 A 2016-02-17
14 3 A 2016-05-18
15 3 A 2016-06-23
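For reference, a base-R sketch of the same subsetting (assuming df1 as defined in the data block of the first answer above); it keeps the "A" rows dated strictly after each person's first "B":
df1$d <- as.Date(df1$EventDate, "%d/%m/%y")   # note: "01/13/12" is not a valid d/m/y date, so it parses to NA
do.call(rbind, lapply(split(df1, df1$Person), function(x) {
  first_b <- min(x$d[x$Event == "B"], na.rm = TRUE)   # date of this person's first "B"
  x[x$Event == "A" & x$d > first_b, c("Person", "Event", "EventDate")]
}))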

Multiply a table (file1) with individual cells of a column (file2) using R

File 1:
Ele A B C D
Es  1 2 3 4
Ep  2 4 3 4
Ek  1 9 3 8

File 2:
A 1
B 2
C 3
D 5
The need is to ensure that each element under column A (file 1) gets multiplied by the value assigned to A in file 2 (and so on for the other columns). I know matrix multiplication in R, but I suppose this is not a case for matrix multiplication. Help would be greatly appreciated. Thanks
You could try
indx <- df2$Col1
# col() gives the column index of each cell of df1[indx], so every column
# is multiplied by the matching value in df2$Col2
df1[indx] * df2$Col2[col(df1[indx])]
# A B C D
#1 1 4 9 20
#2 2 8 9 20
#3 1 18 9 40
Or you could use sweep
sweep(df1[indx], 2, df2$Col2, '*')
# A B C D
#1 1 4 9 20
#2 2 8 9 20
#3 1 18 9 40
data
df1 <- structure(list(Ele = c("Es", "Ep", "Ek"), A = c(1L, 2L, 1L),
B = c(2L, 4L, 9L), C = c(3L, 3L, 3L), D = c(4L, 4L, 8L)),
.Names = c("Ele", "A", "B", "C", "D"), class = "data.frame",
row.names = c(NA, -3L))
df2 <- structure(list(Col1 = c("A", "B", "C", "D"), Col2 = c(1L, 2L,
3L, 5L)), .Names = c("Col1", "Col2"), class = "data.frame",
row.names = c(NA, -4L))
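A third sketch, for comparison, multiplies the matching columns with Map and writes them back (it assumes df1 and df2 as in the data block above):
res <- df1
# multiply each column named in df2$Col1 by the matching value in df2$Col2
res[df2$Col1] <- Map(`*`, df1[df2$Col1], df2$Col2)
res   # same result as the two approaches above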
