Related
I have data from several hundred participants who each provided between 1 and 6 sentences. They then rated their sentence(s) on 4 dimensions, as did two external raters.
I'd like to create a table, grouped by participant, with columns showing these values:
Participants' rate of agreement with rater 1 (par1), with rater 2 (par2) and overall (paro)
Participants' rate of agreement for each dimension with rater 1 (pad1.1, pad2.1 etc.), with rater 2 (pad1.2, pad2.2 etc.) and overall (pad1.o, pad2.o etc.)
Mean difference in rating between participant and rater 1 (mdrp1), rater 2 (mdrp2) and both raters (mdrpo)
Mean difference in rating for each dimension between participant and rater 1 (mdr1p1, mdr2p1 etc.), rater 2 (mdr1p2, mdr2p2 etc.) and both raters (mdr1po, mdr2po etc.)
(So with 4 dimensions there should be 30 values per participant)
Due to the size and structure of the data, I'm not sure where to start on this. I'm guessing that a loop would be necessary, but I've struggled to get my head around how to do that as well.
For agreement I'm considering adding TRUE/FALSE variables and then replacing them with 1 and 0 to eventually calculate agreement:
df <- df %>% mutate(par1 = (df$d1 == df$r1.1)
df <- df %>% mutate(par2 = (df$d1 == df$r2.1)
df <- df %>% mutate(paro = (df$d1 == df$r1.1 & df$d1 == df$r2.1)
And similarly for mean differences, adding variables with rating difference for each dimension...
df <- df %>% mutate(mdr1p1 = (df$d1 - df$r1.1))
df <- df %>% mutate(mdr1p2 = (df$d1 - df$r2.1))
df <- df %>% mutate(mdr1po = (df$d1 - ((df$r1.1 + df$r2.1)/2)))
...But these seem to be quite inefficient approaches!
My data looks like this:
ID Ans d1 d2 d3 d4 r1.1 r1.2 r1.3 r1.4 r2.1 r2.2 r2.3 r2.4
1 53 abc 3 3 3 3 3 2 4 3 3 2 4 3
2 a4 def 3 3 3 3 3 1 2 3 3 1 3 3
3 a4 ghi 4 4 4 4 3 2 5 1 3 1 5 2
4 hj jkl 3 3 3 3 3 1 3 3 3 1 5 3
5 32 mno 2 3 3 3 3 1 3 2 3 1 3 3
6 32 pqr 3 3 3 2 3 2 5 3 4 2 3 3
ID = participant
Ans = participants' written answer
d = dimension rated by participant
r1 = dimensions rated by external rater 1
r2 = dimensions rated by external rater 2
Example data:
structure(list(ID = c(1L, 2L, 2L, 3L, 4L, 4L, 5L),
Ans = c("abc", "def", "ghi", "jkl", "mno", "pqr", "stu"),
d1 = c(3L, 3L, 4L, 3L, 2L, 3L, 3L), d2 = c(3L, 3L, 4L, 3L, 3L, 3L, 1L),
d3 = c(3L, 3L, 4L, 3L, 3L, 3L, 1L), d4 = c(3L, 3L, 4L, 3L, 3L, 2L, 3L),
r1.1 = c(3L, 3L, 3L, 3L, 3L, 3L, 3L), r1.2 = c(2L, 1L, 2L, 1L, 1L, 2L, 3L),
r1.3 = c(4L, 2L, 5L, 3L, 3L, 5L, 3L), r1.4 = c(3L, 3L, 1L, 3L, 2L, 3L, 2L),
r2.1 = c(3L, 3L, 3L, 3L, 3L, 4L, 3L), r2.2 = c(2L, 1L, 1L, 1L, 1L, 2L, 1L),
r2.3 = c(4L, 3L, 5L, 5L, 3L, 3L, 5L), r2.4 = c(3L, 3L, 2L, 3L, 3L, 3L, 2L)),
row.names = c(1L, 2L, 3L, 4L, 5L, 6L), class = "data.frame")
Background
I want to loop over a grouped dataframe of factor variables to count the
occurrences of each value within a variable using count function from dplyr,
and I think that the purrr::map function would be the most suitable.
However, I cannot get this to work.
I tried to use this post for my needs, but this did not work either.
I also tried to hack together a function based on
this post, but could not get this to work with the grouping variable.
Question
Is it possible to loop over a grouped dataframe in the way that I want? If so,
how?
Thanks in advance for your consideration.
Reproducible example
library(tidyverse)
vars_df <-
structure(list(c = structure(c(2L, 3L, 3L, 2L, 3L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 3L, 1L, 2L, 1L, 3L, 3L, 2L, 1L, 2L, 2L, 2L, 3L, 3L, 2L, 1L,
1L, 2L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L), .Label = c("1", "2",
"3"), class = "factor"), pastpsyc = structure(c(2L, 1L, NA, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, NA, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), pastmed = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, NA, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxsuicide = structure(c(2L, 1L, NA,
2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, NA, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxdsh = structure(c(2L, 1L, NA, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), hxtrauma = structure(c(2L, 1L, NA, 2L,
1L, 1L, 1L, 1L, NA, 1L, 1L, NA, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, NA, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor")), row.names = c(NA, -50L), class = c("tbl_df",
"tbl", "data.frame"))
map_dfr(setNames(c('1', '2', '3'),
c('1', '2', '3')), ~
vars_df %>%
group_by(c) %>%
summarise(across(everything(), function(x)
sum(x == .x, na.rm = TRUE)), .groups = 'drop'), .id = 'var')
#> # A tibble: 9 x 7
#> var c pastpsyc pastmed hxsuicide hxdsh hxtrauma
#> <chr> <fct> <int> <int> <int> <int> <int>
#> 1 1 1 3 2 2 5 1
#> 2 1 2 16 9 18 16 10
#> 3 1 3 12 3 8 11 9
#> 4 2 1 0 0 0 0 0
#> 5 2 2 0 0 0 0 0
#> 6 2 3 0 0 0 0 0
#> 7 3 1 0 0 0 0 0
#> 8 3 2 0 0 0 0 0
#> 9 3 3 0 0 0 0 0
vars_df %>%
group_by(c) %>%
count(pastpsyc)
#> # A tibble: 7 x 3
#> # Groups: c [3]
#> c pastpsyc n
#> <fct> <fct> <int>
#> 1 1 0 4
#> 2 1 1 3
#> 3 2 0 8
#> 4 2 1 16
#> 5 3 0 5
#> 6 3 1 12
#> 7 3 <NA> 2
vars_df %>%
group_by(c) %>%
map(~ count(.))
#> Error in UseMethod("count"): no applicable method for 'count' applied to an object of class "factor"
.get_count <-
function(mygroup) {
quo_var <- enquo(mygroup)
vars_df %>%
group_by(!! quo_var) %>%
count() %>%
ungroup()
}
vars <-
vars_df %>%
colnames()
vars %>%
syms() %>%
map(function(var) .get_count(!!var))
#> [[1]]
#> # A tibble: 3 x 2
#> c n
#> <fct> <int>
#> 1 1 7
#> 2 2 24
#> 3 3 19
#>
#> [[2]]
#> # A tibble: 3 x 2
#> pastpsyc n
#> <fct> <int>
#> 1 0 17
#> 2 1 31
#> 3 <NA> 2
#>
#> [[3]]
#> # A tibble: 3 x 2
#> pastmed n
#> <fct> <int>
#> 1 0 33
#> 2 1 14
#> 3 <NA> 3
#>
#> [[4]]
#> # A tibble: 3 x 2
#> hxsuicide n
#> <fct> <int>
#> 1 0 20
#> 2 1 28
#> 3 <NA> 2
#>
#> [[5]]
#> # A tibble: 3 x 2
#> hxdsh n
#> <fct> <int>
#> 1 0 16
#> 2 1 32
#> 3 <NA> 2
#>
#> [[6]]
#> # A tibble: 3 x 2
#> hxtrauma n
#> <fct> <int>
#> 1 0 26
#> 2 1 20
#> 3 <NA> 4
vars %>%
syms() %>%
group_by(c) %>%
map(function(var) .get_count(!!var))
#> Error in UseMethod("group_by"): no applicable method for 'group_by' applied to an object of class "list"
# Created on 2021-05-26 by the reprex package (v2.0.0)
You can use map as -
library(tidyverse)
vars %>% map(~vars_df %>% count(c, .data[[.x]]))
#[[1]]
# A tibble: 3 x 2
# c n
# <fct> <int>
#1 1 7
#2 2 24
3 3 19
#[[2]]
# A tibble: 7 x 3
# c pastpsyc n
# <fct> <fct> <int>
#1 1 0 4
#2 1 1 3
#3 2 0 8
#4 2 1 16
#5 3 0 5
#6 3 1 12
#7 3 NA 2
#...
#...
A different way to show the output in a long format -
vars_df %>% pivot_longer(cols = -c) %>% count(c, name, value)
# c name value n
# <fct> <chr> <fct> <int>
# 1 1 hxdsh 0 2
# 2 1 hxdsh 1 5
# 3 1 hxsuicide 0 5
# 4 1 hxsuicide 1 2
# 5 1 hxtrauma 0 5
# 6 1 hxtrauma 1 1
# 7 1 hxtrauma NA 1
# 8 1 pastmed 0 4
# 9 1 pastmed 1 2
#10 1 pastmed NA 1
# … with 28 more rows
I need to get the centroid for each cluster computed by the hierarchical method.
First, this is a part of my dataset to get reproductible example:
> dput(DATABASE[1:20,])
structure(list(TYPE_PEAU = c(2L, 2L, 3L, 2L, 2L, 2L, 2L, 4L,
3L, 2L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 2L, 4L, 2L), SENSIBILITE = c(3L,
2L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L,
1L, 3L, 3L), IMPERFECTIONS = c(2L, 2L, 3L, 3L, 1L, 2L, 2L, 3L,
2L, 2L, 2L, 1L, 1L, 1L, 3L, 1L, 2L, 1L, 2L, 2L), BRILLANCE = c(3L,
3L, 1L, 3L, 1L, 3L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L,
3L, 3L, 3L), GRAIN_PEAU = c(3L, 3L, 3L, 1L, 3L, 3L, 3L, 2L, 3L,
2L, 1L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 3L), RIDES_VISAGE = c(3L,
1L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 3L, 1L, 3L, 3L, 3L,
3L, 3L, 3L), MAINS = c(2L, 2L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L), PEAU_CORPS = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 2L,
2L, 1L), INTERET_ALIM_NATURELLE = c(1L, 1L, 3L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L), INTERET_ORIGINE_GEO = c(1L,
1L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L,
1L, 1L, 1L), INTERET_VACANCES = c(1L, 2L, 3L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 3L, 1L, 2L), INTERET_ENVIRONNEMENT = c(1L,
3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), AGE_INTERVAL = c(3L, 3L, 4L, 2L, 2L, 3L, 3L, 4L,
4L, 3L, 4L, 2L, 1L, 3L, 3L, 2L, 2L, 2L, 2L, 3L), ATTENTE_BEAUTE_1 = c(1L,
6L, 4L, 4L, 6L, 6L, 3L, 1L, 1L, 4L, 3L, 6L, 2L, 5L, 5L, 6L, 7L,
4L, 6L, 3L), ATTENTE_BEAUTE_2 = c(2L, 2L, 3L, 6L, 4L, 1L, 4L,
7L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 2L, 6L, 2L, 2L, 2L), MILIEU_VIE = c(1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), PROFIL_SELECTIONNE = c(1L, 32L, 21L, 23L, 34L, 31L,
15L, 6L, 1L, 20L, 14L, 34L, 9L, 28L, 28L, 32L, 42L, 20L, 32L,
14L), NOMBRE_ACHAT = c(14L, 6L, 3L, 9L, 8L, 13L, 10L, 14L, 4L,
3L, 10L, 8L, 12L, 3L, 7L, 6L, 4L, 13L, 3L, 3L), NOMBRE_CADEAU = c(2L,
1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L)), .Names = c("TYPE_PEAU", "SENSIBILITE", "IMPERFECTIONS",
"BRILLANCE", "GRAIN_PEAU", "RIDES_VISAGE", "MAINS", "PEAU_CORPS",
"INTERET_ALIM_NATURELLE", "INTERET_ORIGINE_GEO", "INTERET_VACANCES",
"INTERET_ENVIRONNEMENT", "AGE_INTERVAL", "ATTENTE_BEAUTE_1",
"ATTENTE_BEAUTE_2", "MILIEU_VIE", "PROFIL_SELECTIONNE", "NOMBRE_ACHAT",
"NOMBRE_CADEAU"), row.names = c(NA, 20L), class = "data.frame")
then I used as follow :
mydist = dist(DATABASE)
clusters = cutree(hclust(mydist),k=3)
> clusters
[1] 1 2 3 3 2 2 3 1 1 3 1 2 1 3 2 2 2 3 2 1 3 2 1 1 1 1 2 1 2 1 3 3 2 3 2 2 1 1 1 1 3 2 1 1 3 2 1 2 2 1 2 2 3 1 3 1 3
[58] 1 3 2 2 1 1 2 1 2 2 2 3 2 3 1 2 2 1 1 3 3 2 1 2 2 1 2 3 3 3 1 2 1 2 1 1 1 1 1 3 2 2 2 1 1 3 2 2 1 1 1 2 1 1 1 1 3
[115] 1 2 2 1 2 3 1 1 2 3 1 1 1 2 1 3 1 2 3 2 2 1 2 1 1 3 3 2 1 2 2 1 1 1 1 2 1 2 2 3 3 1 1 3 1 3 3 3 3 2 3 1 2 3 3 3 1
[172] 1 2 2 1 1 2 1 2 2 1 3 3 1 2 2 1 1 1 2 2 1 1 1 1 3 2 3 3 1 1 2 2 2 3 1 1 1 2 2 1 2 1 3 1 2 1 3 3 1 1 1 1 2 1 2 2 2
[229] 3 3 1 1 2 1 3 2 2 2 1 1 2 1 3 1 2 1 3 1 3 1 3 1 1 1 1 2 2 1 3 3 3 2 1 2 3 2 2 1 1 3 1 2 3 1 1 2 1 1 1 1 2 2 2 3 2
[286] 1 2 1 1 2 1 2 1 2 2 1 2 3 1 3 1 3 1 1 3 1 1 2 2 1 3 3 2 2 1 2 1 1 2 2 1 3 3 2 2 1 3 3 3 1 1 1 1 3 3 2 1 3 1 2 1 2
[343] 1 2 3 3 2 3 1 3 2 3 3 1 2 2 1 2 2 3 2 1 3 2 2 1 2 3 2 3 3 3 2 2 3 2 1 1 1 2 3 2 2 1 2 2 2 1 2 1 1 1 3 1 2 2 1 1 2
[400] 1 1 1 1 1 2 2 2
Please Note that the objectif is to compute the inter and intra inertia:
So i need to compute the distance between each centroid and all points that are included in its cluster.
So I need to compute the distance between each centroid and its concerned cluster
to used then for computing the inter and intra inertia.
You can define the centroids as the means of variables, per cluster, in DATABASE.
mydist <- dist(DATABASE)
clusters <- cutree(hclust(mydist), k = 3)
## Col means in each cluster
apply(DATABASE, 2, function (x) tapply(x, clusters, mean))
## or
DATABASE$cluster <- clusters # add cluster to DATABASE
# Now take means per group
library(dplyr)
centroids <- DATABASE %>%
group_by(cluster) %>%
summarise_all(funs(mean))
## Distance between centroids
dist(centroids[, -1], method = "euclidean")
## Example for distance in cluster 1 (distance between all observations of cluster 1)
DATABASE %>%
filter(cluster == 1) %>%
select(-cluster) %>%
dist()
you might want to specify your k value into 1:3 not just 3
here is the code and how to find the center (mean)
Suppose I have data which looks like this
Id Name Price sales Profit Month Category Mode Supplier
1 A 2 5 8 1 X K John
1 A 2 6 9 2 X K John
1 A 2 5 8 3 X K John
2 B 2 4 6 1 X L Sam
2 B 2 3 4 2 X L Sam
2 B 2 5 7 3 X L Sam
3 C 2 5 11 1 X M John
3 C 2 5 11 2 X L John
3 C 2 5 11 3 X K John
4 D 2 8 10 1 Y M John
4 D 2 8 10 2 Y K John
4 D 2 5 7 3 Y K John
5 E 2 5 9 1 Y M Sam
5 E 2 5 9 2 Y L Sam
5 E 2 5 9 3 Y M Sam
6 F 2 4 7 1 Z M Kyle
6 F 2 5 8 2 Z L Kyle
6 F 2 5 8 3 Z M Kyle
if I apply table function, it will just combines are the rows and result will be
K L M
X 4 4 1
Y 2 1 3
Z 0 1 2
Now what if I want not the sum of all rows but only sum of those rows with Unique Id
so it looks like
K L M
X 2 2 1
Y 1 1 2
Z 0 1 1
Thanks
If df is your data.frame:
# Subset original data.frame to keep columns of interest
df1 <- df[,c("Id", "Category", "Mode")]
# Remove duplicated rows
df1 <- df1[!duplicated(df1),]
# Create table
with(df1, table(Category, Mode))
# Mode
# Category K L M
# X 2 2 1
# Y 1 1 2
# Z 0 1 1
Or in one line using unique
table(unique(df[c("Id", "Category", "Mode")])[-1])
df <- structure(list(Id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), Name = structure(c(1L, 1L, 1L,
2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L), .Label = c("A",
"B", "C", "D", "E", "F"), class = "factor"), Price = c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), sales = c(5L, 6L, 5L, 4L, 3L, 5L, 5L, 5L, 5L, 8L, 8L, 5L,
5L, 5L, 5L, 4L, 5L, 5L), Profit = c(8L, 9L, 8L, 6L, 4L, 7L, 11L,
11L, 11L, 10L, 10L, 7L, 9L, 9L, 9L, 7L, 8L, 8L), Month = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), Category = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor"), Mode = structure(c(1L, 1L, 1L, 2L, 2L,
2L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 2L, 3L), .Label = c("K",
"L", "M"), class = "factor"), Supplier = structure(c(1L, 1L,
1L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L
), .Label = c("John", "Kyle", "Sam"), class = "factor")), .Names = c("Id",
"Name", "Price", "sales", "Profit", "Month", "Category", "Mode",
"Supplier"), class = "data.frame", row.names = c(NA, -18L))
We can try
library(data.table)
dcast(unique(setDT(df1[c('Category', 'Mode', 'Id')])),
Category~Mode, value.var='Id', length)
# Category K L M
#1: X 2 2 1
#2: Y 1 1 2
#3: Z 0 1 1
Or with dplyr
library(dplyr)
df1 %>%
distinct(Id, Category, Mode) %>%
group_by(Category, Mode) %>%
tally() %>%
spread(Mode, n, fill=0)
# Category K L M
# (chr) (dbl) (dbl) (dbl)
#1 X 2 2 1
#2 Y 1 1 2
#3 Z 0 1 1
Or as #David Arenburg suggested, a variant of the above is
df1 %>%
distinct(Id, Category, Mode) %>%
select(Category, Mode) %>%
table()
I have two data frames
df1 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val1 = 1:20, val2 = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 1L, 2L, 3L)), .Names = c("g1", "g2", "val1", "val2"), row.names = c(NA, -20L), class = "data.frame")
df2 <- structure(list(g1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), g2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L), .Label = c("a", "b", "c"), class = "factor"), val3 = c(5L, 6L, 7L, 3L, 4L, 5L, 2L, 3L, 4L, 8L, 9L, 10L, 4L, 5L, 6L, 5L, 6L)), .Names = c("g1", "g2", "val3"), row.names = c(NA, -17L), class = "data.frame")
> df1
g1 g2 val1 val2
1 A a 1 1
2 A a 2 2
3 A a 3 3
4 A a 4 4
5 A b 5 1
6 A b 6 2
7 A b 7 3
8 A c 8 1
9 A c 9 2
10 A c 10 3
11 B a 11 1
12 B a 12 2
13 B a 13 3
14 B b 14 1
15 B b 15 2
16 B b 16 3
17 B b 17 4
18 B c 18 1
19 B c 19 2
20 B c 20 3
> df2
g1 g2 val3
1 A a 5
2 A a 6
3 A a 7
4 A b 3
5 A b 4
6 A b 5
7 A c 2
8 A c 3
9 B c 4
10 B a 8
11 B a 9
12 B a 10
13 B b 4
14 B b 5
15 B b 6
16 B c 5
17 B c 6
My aim is to rescale df1$val2 to take values between the min and max values of df2$val3 within the respective groups.
I tried this:
library(dplyr)
df1 <- df1 %.% group_by(g1, g2) %.% mutate(rescaled=(max(df2$val3)-min(df2$val3))*(val2-min(val2))/(max(val2)-min(val2))+min(df2$val3))
But the output is different from what I expect. The problem is that I can neither cbind nor merge the two data frames due to their different lengths. Any hints?
Does this work?
library(plyr)
df3 <- ddply(df2, .(g1, g2), summarize, max.val=max(val3), min.val=min(val3))
merged.df <- merge(df1, df3, by=c("g1", "g2"), all.x=TRUE)
## Now rescale merged.df$val2 as desired