I have the df below:
df<-data.frame(geokey=c("A","A","A","A","A","A","B","B","B","B","B","B"),
upc=c("100","100","101","101","102","102","200","200","201","201",
"202","202"),
endwk=c("14-07-2021","21-07-2021","14-07-2021","21-07-2021","14-07-2021","21-07-2021",
"14-07-2021","21-07-2021","14-07-2021","21-07-2021","14-07-2021","21-07-2021"),
Base_units=c(2,3,1,2,4,1,1,4,2,3,3,2),
Base_price=c(0.1,0.2,0.2,0.1,0.1,0.1,0.2,0.3,0.4,0.1,0.2,0.3),
Incr_units=c(2,1,1,1,2,1,3,2,2,3,1,1),
incr_price=c(0.1,0.1,0.1,0.3,0.2,0.1,0.1,0.2,0.1,0.2,0.1,0.2))
> df
geokey upc endwk Base_units Base_price Incr_units incr_price
1 A 100 14-07-2021 2 0.1 2 0.1
2 A 100 21-07-2021 3 0.2 1 0.1
3 A 101 14-07-2021 1 0.2 1 0.1
4 A 101 21-07-2021 2 0.1 1 0.3
5 A 102 14-07-2021 4 0.1 2 0.2
6 A 102 21-07-2021 1 0.1 1 0.1
7 B 200 14-07-2021 1 0.2 3 0.1
8 B 200 21-07-2021 4 0.3 2 0.2
9 B 201 14-07-2021 2 0.4 2 0.1
10 B 201 21-07-2021 3 0.1 3 0.2
11 B 202 14-07-2021 3 0.2 1 0.1
12 B 202 21-07-2021 2 0.3 1 0.2
Expected output: group by geokey and endwk (combining the upc values into one label), with all units ("vol") columns totalled (summed) and the price columns averaged, as shown below:
df_merged<-data.frame(geokey=c("A","A","B","B"),
upc=c("upc_100_101_102","upc_100_101_102","upc_200_201_202","upc_200_201_202"),
endwk=c("14-07-2021","21-07-2021","14-07-2021","21-07-2021"),
Base_units_totalled=c(7,6,6,9),
Base_price_averaged=c(0.133,0.133,0.2667,0.2333),
Incr_units_totalled=c(5,3,3,6),
incr_price_averaged=c(0.1333,0.1,0.1,0.2))
> df_merged
geokey upc endwk Base_units_totalled Base_price_averaged Incr_units_totalled incr_price_averaged
1 A upc_100_101_102 14-07-2021 7 0.1330 5 0.1333
2 A upc_100_101_102 21-07-2021 6 0.1330 3 0.1000
3 B upc_200_201_202 14-07-2021 6 0.2667 3 0.1000
4 B upc_200_201_202 21-07-2021 9 0.2333 6 0.2000
Help will be appreciated.
I presume you want to summarize the upc column and not group by it?
library(dplyr)

df %>%
  group_by(geokey, endwk) %>%
  summarize(upc = paste0("upc_", paste(upc, collapse = "_")),
            across(contains("units"), sum),
            across(contains("price"), mean),
            .groups = "drop")
# A tibble: 4 x 7
geokey endwk upc Base_units Incr_units Base_price incr_price
* <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 A 14-07-2021 upc_100_101_102 7 5 0.133 0.133
2 A 21-07-2021 upc_100_101_102 6 3 0.133 0.167
3 B 14-07-2021 upc_200_201_202 6 6 0.267 0.1
4 B 21-07-2021 upc_200_201_202 9 6 0.233 0.2
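Note that two of the expected numbers in the question appear to be arithmetic slips: for A / 21-07-2021, mean(c(0.1, 0.3, 0.1)) is 0.167 rather than 0.1, and for B / 14-07-2021 the Incr_units total is 3 + 2 + 1 = 6 rather than 3. The output above shows the computed values.
If you prefer data.table, an equivalent sketch (spelling the columns out instead of using across()):
library(data.table)
setDT(df)[, .(upc        = paste0("upc_", paste(upc, collapse = "_")),
              Base_units = sum(Base_units),
              Incr_units = sum(Incr_units),
              Base_price = mean(Base_price),
              incr_price = mean(incr_price)),
          by = .(geokey, endwk)]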
I need to calculate a new value in the New column based on Value2 and the previously created value of the same New column.
Additionally, I want to do this per Group.
I made a for loop, but it stopped working when I added a second loop for the groups.
for(j in 1:(*EACHGROUP*)){        # placeholder loop over the groups
  for(i in 2:nrow(DAT)){          # start at the second place in each group
    DAT$New[i] <- DAT$New[i-1] * 10^(DAT$Value2[i])
  }
}
Dummy data
Year <- c(1980,1990,2000,2005,1993,2008,1999,2003,2005)
Group <- c("A","A","A","A","B","B","C","C","C")
Value2 <- c(0,0.25,0.1,-0.3,.5,0.7,-0.8,0.01,0.2)
New <- c(1,1,1,1,1,1,1,1,1)
# data.frame(cbind(...)) would coerce every column to character, so build it directly:
DAT <- data.frame(Year, Group, Value2, New)
Output:
Year Group Value2 New
1 1980 A 0 1
2 1990 A 0.25 1
3 2000 A 0.1 1
4 2005 A -0.3 1
5 1993 B 0.5 1
6 2008 B 0.7 1
7 1999 C -0.8 1
8 2003 C 0.01 1
9 2005 C 0.2 1
How can I continue with this approach?
Or should I use dplyr, for example, to do this more easily?
Desired result
Year Group Value2 New
1 1980 A 0 1
2 1990 A 0.25 1.78
3 2000 A 0.1 2.24
4 2005 A -0.3 1.12
5 1993 B 0.5 1
6 2008 B 0.7 5.01
7 1999 C -0.8 1
8 2003 C 0.01 1.02
9 2005 C 0.2 1.62
Best regards
Here is a dplyr way, with an auxiliary function fun.
# fun applies the recurrence x[i] <- x[i-1] * 10^y[i] down one group
fun <- function(x, y){
for(i in seq_along(x)[-1]){
x[i] <- x[i - 1] * 10^y[i]
}
x
}
library(dplyr)

DAT %>%
  group_by(Group) %>%
  mutate(New = fun(New, Value2))
# A tibble: 9 x 4
# Groups: Group [3]
# Year Group Value2 New
# <dbl> <chr> <dbl> <dbl>
#1 1980 A 0 1
#2 1990 A 0.25 1.78
#3 2000 A 0.1 2.24
#4 2005 A -0.3 1.12
#5 1993 B 0.5 1
#6 2008 B 0.7 5.01
#7 1999 C -0.8 1
#8 2003 C 0.01 1.02
#9 2005 C 0.2 1.62
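Since the recurrence New[i] = New[i-1] * 10^Value2[i] (with New[1] = 1) is just a cumulative product, a loop-free variant is also possible. A sketch with the same grouping:
DAT %>%
  group_by(Group) %>%
  mutate(New = cumprod(c(1, 10^Value2[-1]))) %>%
  ungroup()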
I have a dataframe with columns id, price1, price2, price3, prob1, prob2, prob3. I want to convert the wide-format price and prob columns into long format.
library(dplyr)
library(data.table)
a <- data.table("id" = c(1,2,4),
"price1"=c(1.2,2.44,5.6),
"price2"=c(7.6,8,65),
"price3"=c(1.2,4.5,7.8),
"prob1"=c(0.1,0.3,0.5),
"prob2"=c(0.3,0.35,0.75),
"prob3"=c(0.18,0.31,0.58))
> a
id price1 price2 price3 prob1 prob2 prob3
1 1 1.20 7.6 1.2 0.1 0.30 0.18
2 2 2.44 8.0 4.5 0.3 0.35 0.31
3 4 5.60 65.0 7.8 0.5 0.75 0.58
I want to transform table a into b as follows:
b <- data.table("id" = c(1,1,1,2,2,2,4,4,4),
                "order" = c(1,2,3,1,2,3,1,2,3),
                "price" = c(1.20,7.6,1.2,2.44,8.0,4.5,5.60,65.0,7.8),
                "prob" = c(0.1,0.30,0.18,0.3,0.35,0.31,0.5,0.75,0.58))
> b
id order price prob
1: 1 1 1.20 0.10
2: 1 2 7.60 0.30
3: 1 3 1.20 0.18
4: 2 1 2.44 0.30
5: 2 2 8.00 0.35
6: 2 3 4.50 0.31
7: 4 1 5.60 0.50
8: 4 2 65.00 0.75
9: 4 3 7.80 0.58
Here order indicates the sequence number of the price and prob values; without it they would get shuffled.
I want to do this transformation in sparklyr.
You can use pivot_longer specifying names_pattern.
tidyr::pivot_longer(a, cols = -id,
names_to = c('.value', 'order'),
names_pattern = '(.*?)(\\d+)')
# A tibble: 9 x 4
# id order price prob
# <dbl> <chr> <dbl> <dbl>
#1 1 1 1.2 0.1
#2 1 2 7.6 0.3
#3 1 3 1.2 0.18
#4 2 1 2.44 0.3
#5 2 2 8 0.35
#6 2 3 4.5 0.31
#7 4 1 5.6 0.5
#8 4 2 65 0.75
#9 4 3 7.8 0.580
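For the sparklyr part: sparklyr 1.4+ supports tidyr verbs, including pivot_longer, directly on Spark data frames, so, assuming such a version, the same call should work on a copied-in table. A sketch (untested on a live cluster, and I have not verified that every pivot_longer argument is implemented for the Spark backend):
library(sparklyr)
library(tidyr)

sc <- spark_connect(master = "local")
a_tbl <- copy_to(sc, a, overwrite = TRUE)

pivot_longer(a_tbl, cols = -id,
             names_to = c('.value', 'order'),
             names_pattern = '(.*?)(\\d+)')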
I have a data frame true_set that I would like to sort based on the order of values in the rows of a matrix orders.
true_set <- data.frame(dose1 = c(rep(1,5), rep(2,5), rep(3,5)),
                       dose2 = rep(1:5, 3),
                       toxicity = c(0.05,0.1,0.15,0.3,0.45,0.1,0.15,0.3,
                                    0.45,0.55,0.15,0.3,0.45,0.55,0.6),
                       efficacy = c(0.2,0.3,0.4,0.5,0.6,0.4,0.5,0.6,
                                    0.7,0.8,0.5,0.6,0.7,0.8,0.9),
                       d = 1:15)
orders<-matrix(nrow=3,ncol=15)
orders[1,]<-c(1,2,6,3,7,11,4,8,12,5,9,13,10,14,15)
orders[2,]<-c(1,6,2,3,7,11,12,8,4,5,9,13,14,10,15)
orders[3,]<-c(1,6,2,11,7,3,12,8,4,13,9,5,14,10,15)
The expected result would be:
First orders[1,] :
dose1 dose2 toxicity efficacy d
1 1 1 0.05 0.2 1
2 1 2 0.10 0.3 2
3 2 1 0.10 0.4 6
4 1 3 0.15 0.4 3
5 2 2 0.15 0.5 7
6 3 1 0.15 0.5 11
7 1 4 0.30 0.5 4
8 2 3 0.30 0.6 8
9 3 2 0.30 0.6 12
10 1 5 0.45 0.6 5
11 2 4 0.45 0.7 9
12 3 3 0.45 0.7 13
13 2 5 0.55 0.8 10
14 3 4 0.55 0.8 14
15 3 5 0.60 0.9 15
Then the same for orders[2,] and orders[3,].
true_set <- data.frame(dose1 = c(rep(1,5), rep(2,5), rep(3,5)),
                       dose2 = rep(1:5, 3),
                       toxicity = c(0.05,0.1,0.15,0.3,0.45,0.1,0.15,0.3,
                                    0.45,0.55,0.15,0.3,0.45,0.55,0.6),
                       efficacy = c(0.2,0.3,0.4,0.5,0.6,0.4,0.5,0.6,
                                    0.7,0.8,0.5,0.6,0.7,0.8,0.9),
                       d = 1:15)
orders<-matrix(nrow=3,ncol=15)
orders[1,]<-c(1,2,6,3,7,11,4,8,12,5,9,13,10,14,15)
orders[2,]<-c(1,6,2,3,7,11,12,8,4,5,9,13,14,10,15)
orders[3,]<-c(1,6,2,11,7,3,12,8,4,13,9,5,14,10,15)
# Specify your order set in the row dimension
First_order <- true_set[orders[1,],]
Second_order <- true_set[orders[2,],]
Third_order <- true_set[orders[3,],]
# If you want to store all orders in a list, you can try the command below:
First_orders <- list(First_Order=true_set[orders[1,],],Second_Order=true_set[orders[2,],],Third_Order=true_set[orders[3,],])
First_orders[[1]] # OR First_orders$First_Order
First_orders[[2]] # OR First_orders$Second_Order
First_orders[[3]] # OR First_orders$Third_Order
# If you want to combine the orders column wise, try the command below:
First_orders <- cbind(First_Order=true_set[orders[1,],],Second_Order=true_set[orders[2,],],Third_Order=true_set[orders[3,],])
# If you want to combine the orders row wise, try the command below:
First_orders <- rbind(First_Order=true_set[orders[1,],],Second_Order=true_set[orders[2,],],Third_Order=true_set[orders[3,],])
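If the number of order vectors may grow, the whole list can also be built programmatically rather than one line per row. A sketch (the order_ names are just illustrative):
all_orders <- lapply(seq_len(nrow(orders)), function(i) true_set[orders[i, ], ])
names(all_orders) <- paste0("order_", seq_len(nrow(orders)))
all_orders$order_1 # same as true_set[orders[1, ], ]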
Consider this data:
m = data.frame(pop=c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4),
id=c(0,1,1,1,1,1,0,2,1,1,1,2,1,2,2,2))
> m
pop id
1 1 0
2 1 1
3 1 1
4 1 1
5 2 1
6 2 1
7 2 0
8 2 2
9 2 1
10 3 1
11 3 1
12 3 2
13 3 1
14 3 2
15 4 2
16 4 2
I would like to get the frequency of each unique id in each unique pop. For example, id 1 is present 3 times out of the 4 rows where pop == 1, so the frequency of id 1 in pop 1 is 0.75.
I came up with this ugly solution:
out = matrix(0,ncol=3)
for (p in unique(m$pop))
{
for (i in unique(m$id))
{
m1 = m[m$pop == p,]
f = nrow(m1[m1$id == i,])/nrow(m1)
out = rbind(out, c(p, f, i))
}
}
out = out[-1,]
colnames(out) = c("pop", "freq", "id")
# SOLUTION
> out
pop freq id
[1,] 1 0.25 0
[2,] 1 0.75 1
[3,] 1 0.00 2
[4,] 2 0.20 0
[5,] 2 0.60 1
[6,] 2 0.20 2
[7,] 3 0.00 0
[8,] 3 0.60 1
[9,] 3 0.40 2
[10,] 4 0.00 0
[11,] 4 0.00 1
[12,] 4 1.00 2
I am sure there exists a more efficient solution using data.table or table, but I couldn't find it.
Here's what I might do:
as.data.frame(prop.table(table(m),1))
# pop id Freq
# 1 1 0 0.25
# 2 2 0 0.20
# 3 3 0 0.00
# 4 4 0 0.00
# 5 1 1 0.75
# 6 2 1 0.60
# 7 3 1 0.60
# 8 4 1 0.00
# 9 1 2 0.00
# 10 2 2 0.20
# 11 3 2 0.40
# 12 4 2 1.00
If you want it sorted by pop, you can do that afterwards (see the sketch below). Alternatively, you could transpose the table with t before converting to data.frame, or use rev(m) and prop.table on dimension 2.
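A sketch of the sort-afterwards option:
res <- as.data.frame(prop.table(table(m), 1))
res[order(res$pop, res$id), ]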
Try:
library(dplyr)
m %>%
group_by(pop, id) %>%
summarise(s = n()) %>%
mutate(freq = s / sum(s)) %>%
select(-s)
Which gives:
#Source: local data frame [8 x 3]
#Groups: pop
#
# pop id freq
#1 1 0 0.25
#2 1 1 0.75
#3 2 0 0.20
#4 2 1 0.60
#5 2 2 0.20
#6 3 1 0.60
#7 3 2 0.40
#8 4 2 1.00
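Note that, unlike the loop in the question, this drops the zero-frequency combinations (8 rows instead of 12). If you need them, one option is to complete the grid afterwards; a sketch using tidyr:
library(tidyr)

m %>%
  count(pop, id) %>%
  group_by(pop) %>%
  mutate(freq = n / sum(n)) %>%
  ungroup() %>%
  complete(pop, id, fill = list(freq = 0)) %>%
  select(-n)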
A data.table solution:
library(data.table)
setDT(m)[, {div = .N; .SD[, .N/div, keyby = id]}, by = pop]
# pop id V1
#1: 1 0 0.25
#2: 1 1 0.75
#3: 2 0 0.20
#4: 2 1 0.60
#5: 2 2 0.20
#6: 3 1 0.60
#7: 3 2 0.40
#8: 4 2 1.00
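Here .N inside the by = pop grouping is the group size (saved as div), and the inner .SD[, .N/div, keyby = id] counts each id within the group and divides by that size. Like the dplyr version, it omits the zero-frequency combinations.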