Use tidyr::pivot_longer for multiple measurements with uncertainties - r

A common type of data set I come across contains several measurements with associated uncertainties combined in each row. Here's an example:
structure(list(meas1 = c(150.3197, 19.95853, 161.40022, 103.23733, 140.28786, 193.42983, 75.237556, 207.84688, 116.4379, 80.251797 ), unc1 = c(0.038140954, 0.09151666, 0.035390881, 0.043274285, 0.03396304, 0.033362432, 0.05290015, 0.035449262, 0.038330437, 0.049171039), meas2 = c(1270.5522, 562.92518, 940.65152, 696.6982, 380.22449, 1979.0521, 1022.01, 1269.7508, 1686.6116, 1256.0033 ), unc2 = c(0.06063558, 0.061388181, 0.060714985, 0.061178737, 0.061318833, 0.060302475, 0.060876815, 0.060659146, 0.060412551, 0.060635459), meas3 = c(601.11331, 1675.2958, 608.84736, 998.76837, 266.2926, 2933.9751, 1682.3191, 775.43699, 428.29473, 1393.6564 ), unc3 = c(0.103445147, 0.102309634, 0.103147224, 0.101772166, 0.104186185, 0.101292496, 0.101556363, 0.102983978, 0.10394405, 0.101598249), ID = 1:10), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"))
I want to get it in a tidy configuration, like this:
ID meas_type reading uncert
1 1 meas1 150.31970 0.03814095
2 1 meas2 1270.55220 0.06063558
3 1 meas3 601.11331 0.10344515
4 2 meas1 19.95853 0.09151666
5 2 meas2 562.92518 0.06138818
6 2 meas3 1675.29580 0.10230963 ...
I have a work-around, but am wondering if there isn't a pivot_longer() method that would do this more elegantly.
Here's my kludgy solution:
df_vals <- df_raw %>%
pivot_longer(cols = c("meas1", "meas2", "meas3"),
names_to = "meas_type",
values_to = "reading")
df_vals <- df_vals[, 4:6]
df_unc <- df_raw %>%
pivot_longer(cols = starts_with("unc"),
values_to = "uncert")
df_unc <- df_unc[, 4:6]
df <- cbind(df_vals, "uncert" = df_unc$uncert)

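We can use the names_pattern argument of pivot_longer (here df is the original wide data frame, called df_raw in the question). The special ".value" entry in names_to tells pivot_longer to use the first captured group of each column name (meas / unc) as the name of an output value column, while the second captured group (the trailing digits) goes into meas_type. Note that because the first group is greedy, this pattern is only suitable for single-digit suffixes; for names such as meas10, a pattern like "(\\D+)(\\d+)" is safer.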
tidyr::pivot_longer(df, cols = -ID,
names_to = c(".value", "meas_type"),
names_pattern = "(.*)(\\d+)")
# A tibble: 30 x 4
# ID meas_type meas unc
# <int> <chr> <dbl> <dbl>
# 1 1 1 150. 0.0381
# 2 1 2 1271. 0.0606
# 3 1 3 601. 0.103
# 4 2 1 20.0 0.0915
# 5 2 2 563. 0.0614
# 6 2 3 1675. 0.102
# 7 3 1 161. 0.0354
# 8 3 2 941. 0.0607
# 9 3 3 609. 0.103
#10 4 1 103. 0.0433
# … with 20 more rows

If you would consider a base R solution, reshape() does what you want; note that it needs a plain data frame rather than a tibble, hence the as.data.frame() call:
d <- as.data.frame(d)
reshape(data=d, varying=1:6,
timevar="meas_type",
direction="long",
sep="")
ID meas_type meas unc
1.1 1 1 150.31970 0.03814095
2.1 2 1 19.95853 0.09151666
3.1 3 1 161.40022 0.03539088
4.1 4 1 103.23733 0.04327429
5.1 5 1 140.28786 0.03396304
6.1 6 1 193.42983 0.03336243

We can use melt from data.table
library(data.table)
melt(setDT(df1), measure = patterns("^unc", "meas"),
value.name = c("unc", "meas"), variable.name = "meas_type")
# ID meas_type unc meas
# 1: 1 1 0.03814095 150.31970
# 2: 2 1 0.09151666 19.95853
# 3: 3 1 0.03539088 161.40022
# 4: 4 1 0.04327429 103.23733
# 5: 5 1 0.03396304 140.28786
# 6: 6 1 0.03336243 193.42983
# 7: 7 1 0.05290015 75.23756
# 8: 8 1 0.03544926 207.84688
# 9: 9 1 0.03833044 116.43790
#10: 10 1 0.04917104 80.25180
#11: 1 2 0.06063558 1270.55220
#...

Related

Trying to make a new column using R [duplicate]

I have a data frame (all_data) in which I have a list of sites (1... to n) and their scores e.g.
site score
1 10
1 11
1 12
4 10
4 11
4 11
8 9
8 8
8 7
I want to create a column that numbers each level of site in numerical order, like a counter. In the example, the sites (1, 4, and 8) would have a corresponding counter from 1 to 3 in the 'number' column:
site score number
1 10 1
1 11 1
1 12 1
4 10 2
4 11 2
4 11 2
8 9 3
8 8 3
8 7 3
I am sure this must be easily solved, but I have not found a way yet.
Try Data$number <- as.numeric(as.factor(Data$site))
On a side note: the difference between my solution and @Chase's on the one hand, and @DWin's on the other, is the ordering of the numbers. Both as.factor and factor automatically sort the levels, whereas that doesn't happen in @DWin's solution:
Dat <- data.frame(site = rep(c(1,8,4), each = 3), score = runif(9))
Dat$number <- as.numeric(factor(Dat$site))
Dat$sitenum <- match(Dat$site, unique(Dat$site) )
Gives
> Dat
site score number sitenum
1 1 0.7377561 1 1
2 1 0.3131139 1 1
3 1 0.7862290 1 1
4 8 0.4480387 3 2
5 8 0.3873210 3 2
6 8 0.8778102 3 2
7 4 0.6916340 2 3
8 4 0.3033787 2 3
9 4 0.6552808 2 3
In the new dplyr 1.0.0 we can use cur_group_id() which gives a unique numeric identifier to a group.
library(dplyr)
df %>% group_by(site) %>% mutate(number = cur_group_id())
# site score number
# <int> <int> <int>
#1 1 10 1
#2 1 11 1
#3 1 12 1
#4 4 10 2
#5 4 11 2
#6 4 11 2
#7 8 9 3
#8 8 8 3
#9 8 7 3
data
df <- structure(list(site = c(1L, 1L, 1L, 4L, 4L, 4L, 8L, 8L, 8L),
score = c(10L, 11L, 12L, 10L, 11L, 11L, 9L, 8L, 7L)),
class = "data.frame", row.names = c(NA, -9L))
Two other options:
1) Using the .GRP special symbol from the data.table package:
library(data.table)
setDT(dat)[, num := .GRP, by = site]
with the example dataset from below this results in:
> dat
site score num
1: 1 0.14945795 1
2: 1 0.60035697 1
3: 1 0.94643075 1
4: 8 0.68835336 2
5: 8 0.50553372 2
6: 8 0.37293624 2
7: 4 0.33580504 3
8: 4 0.04825135 3
9: 4 0.61894754 3
10: 8 0.96144729 2
11: 8 0.65496051 2
12: 8 0.51029199 2
2) Using the group_indices function from dplyr:
dat$num <- group_indices(dat, site)
or when you want to work around non-standard evaluation:
library(dplyr)
dat %>%
mutate(num = group_indices_(dat, .dots = c('site')))
which results in:
site score num
1 1 0.42480366 1
2 1 0.98736177 1
3 1 0.35766187 1
4 8 0.06243182 3
5 8 0.55617002 3
6 8 0.20304632 3
7 4 0.90855921 2
8 4 0.25215078 2
9 4 0.44981251 2
10 8 0.60288270 3
11 8 0.46946587 3
12 8 0.44941782 3
As can be seen, dplyr gives a different order of the group numbers.
If you want another number every time the group changes, there are several other options:
1) with base R:
# option 1:
dat$num <- cumsum(c(TRUE, head(dat$site, -1) != tail(dat$site, -1)))
# option 2:
x <- rle(dat$site)$lengths
dat$num <- rep(seq_along(x), times=x)
2) with the data.table package:
library(data.table)
setDT(dat)[, num := rleid(site)]
which all result in:
> dat
site score num
1 1 0.80817855 1
2 1 0.07881334 1
3 1 0.60092828 1
4 8 0.71477988 2
5 8 0.51384565 2
6 8 0.72011650 2
7 4 0.74994627 3
8 4 0.09564052 3
9 4 0.39782587 3
10 8 0.29446540 4
11 8 0.61725367 4
12 8 0.97427413 4
Used data:
dat <- data.frame(site = rep(c(1,8,4,8), each = 3), score = runif(12))
This should be fairly efficient and understandable:
Dat$sitenum <- match(Dat$site, unique(Dat$site))
Using the data from @Jaap, a different dplyr possibility using dense_rank() could be:
dat %>%
mutate(ID = dense_rank(site))
site score ID
1 1 0.1884490 1
2 1 0.1087422 1
3 1 0.7438149 1
4 8 0.1150771 3
5 8 0.9978203 3
6 8 0.7781222 3
7 4 0.4081830 2
8 4 0.2782333 2
9 4 0.9566959 2
10 8 0.2545320 3
11 8 0.1201062 3
12 8 0.5449901 3
Or a rleid()-like dplyr approach, with the data arranged first:
dat %>%
arrange(site) %>%
mutate(ID = with(rle(site), rep(seq_along(lengths), lengths)))
site score ID
1 1 0.1884490 1
2 1 0.1087422 1
3 1 0.7438149 1
4 4 0.4081830 2
5 4 0.2782333 2
6 4 0.9566959 2
7 8 0.1150771 3
8 8 0.9978203 3
9 8 0.7781222 3
10 8 0.2545320 3
11 8 0.1201062 3
12 8 0.5449901 3
Or using duplicated() and cumsum():
df %>%
mutate(ID = cumsum(!duplicated(site)))
The same with base R:
df$ID <- with(rle(df$site), rep(seq_along(lengths), lengths))
Or:
df$ID <- cumsum(!duplicated(df$site))
You can turn site into a factor and then return the numeric or integer values of that factor:
dat <- data.frame(site = rep(c(1,4,8), each = 3), score = runif(9))
dat$number <- as.integer(factor(dat$site))
dat
site score number
1 1 0.5305773 1
2 1 0.9367732 1
3 1 0.1831554 1
4 4 0.4068128 2
5 4 0.3438962 2
6 4 0.8123883 2
7 8 0.9122846 3
8 8 0.2949260 3
9 8 0.6771526 3
Another solution using the data.table package.
Example with the more complete dataset provided by Jaap:
setDT(dat)[, number := frank(site, ties.method = "dense")]
dat
site score number
1: 1 0.3107920 1
2: 1 0.3640102 1
3: 1 0.1715318 1
4: 8 0.7247535 3
5: 8 0.1263025 3
6: 8 0.4657868 3
7: 4 0.6915818 2
8: 4 0.3558270 2
9: 4 0.3376173 2
10: 8 0.7934963 3
11: 8 0.9641918 3
12: 8 0.9832120 3
Another way to do it, which I think is easy to understand even when you know little about R:
library(dplyr)
df <- data.frame('site' = c(1, 1, 1, 4, 4, 4, 8, 8, 8))
df <- mutate(df, 'number' = cumsum(site != lag(site, default=-1)))
I too recently needed a solution to this. I didn't find this thread, started my own question and was redirected here (thank you). It is good to see many solutions, but to me a scalable solution is important (and, I feel, good practice). Hence, I benchmarked several solutions below.
df <- data.table(country = rep(c('a', 'b', 'b', 'c', 'c', 'c'), 1e7)
)
a <-
microbenchmark(factor = {df[, group_id := as.integer(factor(country))]}
, unique_match = df[, group_id := match(country, unique(country))]
, rle = df[ , group_id := with(rle(country), rep(seq_along(lengths), lengths))]
, dup_cumsum = df[, group_id := cumsum(!duplicated(country))]
, frank = df[, group_id := frank(country, ties.method = "dense")]
, GRP = df[, group_id := .GRP, country]
, rleid = df[, group_id := rleid(country)]
, cumsum_head_tail = df[, group_id := cumsum(c(TRUE, head(country, -1) != tail(country, -1)))]
, times = 50
)
autoplot(a)
It would appear the podium is held by data.table.
Still, was great to learn of alternatives e.g. cumsum(!duplicated(country)). What a brainteaser!
Using collapse::group, Fast Hash-Based Grouping:
library(collapse)
d = data.frame(site = rep(c(1,8,4), each = 3))
settransform(d, number = group(site)) # settransform updates data by reference. See also ftransform.
d
# site number
# 1 1 1
# 2 1 1
# 3 1 1
# 4 8 2
# 5 8 2
# 6 8 2
# 7 4 3
# 8 4 3
# 9 4 3
The collapse functions are considerably faster on larger data. Here I compare with two common base idioms (factor / as.integer; match / unique), and two data.table methods (.GRP; frank), using 1e5 groups with 1e3 rows each.
library(data.table)
library(microbenchmark)
nr = 1e3
ng = 1e5
set.seed(1)
d1 = data.table(g = sample(1:ng, nr*ng, replace = TRUE))
d2 = copy(d1)
d3 = copy(d1)
d4 = copy(d1)
d5 = copy(d1)
microbenchmark(
factor = {d1[ , gi := as.integer(factor(g))]},
unique_match = {d2[, gi := match(g, unique(g))]},
frank = {d3[, gi := frank(g, ties.method = "dense")]},
GRP = {d4[, gi := .GRP, by = g]},
collap = {settransform(d5, gi = group(g))},
times = 20L)
# Unit: milliseconds
# expr min lq mean median uq max
# factor 46648.6099 48493.0146 49918.956 49336.208 51547.0789 53585.472
# unique_match 12662.0978 13057.3210 13534.391 13530.457 13998.0141 14407.036
# frank 2628.4923 2695.7064 3240.522 2833.950 3797.5579 5547.227
# GRP 2754.2153 3283.2444 3796.109 3717.239 4184.5174 5117.918
# collap 640.1993 668.2301 729.351 698.307 753.2932 1086.592
# Check equality with data.table .GRP. Use as.vector to remove attributes
all.equal(d4$gi, as.vector(d5$gi))
# [1] TRUE
Note: group, .GRP and match / unique all create the group number according to order of appearance of the different values in the original data (also discussed in previous posts).
If you want to keep your existing columns and assign back to the same data frame...
my_df <- my_df %>%
select(everything()) %>%
group_by(geo) %>%
mutate(geo_id = cur_group_id())
And you can do multiple columns like so...
my_df <- my_df %>%
select(everything()) %>%
group_by(geo) %>%
mutate(geo_id = cur_group_id()) %>%
group_by(state) %>%
mutate(state_id = cur_group_id()) %>%
group_by(name) %>%
mutate(name_id = cur_group_id())
If the numbers of the site column were unordered, we could use as_factor() in combination with fct_inorder() from the forcats package:
library(tibble)
library(dplyr)
library(forcats)
all_data_unordered <- tibble(site = c(1,1,1,8,8,8,4,4,4),
score = c(10,11,12,10,11,11,9,8,7))
all_data_unordered |>
mutate(number = as_factor(site) |> fct_inorder() |> as.integer())
#> # A tibble: 9 × 3
#> site score number
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 11 1
#> 3 1 12 1
#> 4 8 10 2
#> 5 8 11 2
#> 6 8 11 2
#> 7 4 9 3
#> 8 4 8 3
#> 9 4 7 3
Created on 2021-11-05 by the reprex package (v2.0.1)
Since dplyr 1.1.0, another option is consecutive_id:
library(dplyr)
df %>%
mutate(id = consecutive_id(site))
# site score id
# 1 1 10 1
# 2 1 11 1
# 3 1 12 1
# 4 4 10 2
# 5 4 11 2
# 6 4 11 2
# 7 8 9 3
# 8 8 8 3
# 9 8 7 3
Note that consecutive_id, like data.table::rleid but unlike cur_group_id or as.numeric(factor(.)), returns an ID for each run of consecutive values, meaning that if the same value appears again later, non-consecutively, it gets a new id.
df <- structure(list(site = c(1L, 1L, 1L, 4L, 4L, 4L, 1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -9L))
df %>%
mutate(cons_id = consecutive_id(site)) %>%
group_by(site) %>%
mutate(cur_group_id = cur_group_id())
# site cons_id cur_group_id
# 1 1 1 1
# 2 1 1 1
# 3 1 1 1
# 4 4 2 2
# 5 4 2 2
# 6 4 2 2
# 7 1 3 1
# 8 1 3 1
# 9 1 3 1

tidyverse: data transformation, gather()

I am trying to transform a dataset:
[1]: https://i.stack.imgur.com/09Ioo.png
To something like this:
[2]: https://i.stack.imgur.com/vKKu2.png
How can I do this in R? I tried using gather() but somehow I'm not getting the results.
library(tidyverse)
df_gather <- df %>% gather(key = "Day", "Sensor",2:5)
View(df_gather)
Thanks in advance for your help!
Here is another tidyverse approach:
dat %>%
rename_with(., ~str_replace_all(., "Sensor", "Time_")) %>%
pivot_longer(-Date,
names_sep = "_",
names_to = c(".value", "Sensor")
)
Date Sensor Time
<int> <chr> <dbl>
1 1 1 155.
2 1 2 160.
3 1 3 126.
4 1 4 162.
5 1 5 155.
6 2 1 126.
7 2 2 133.
8 2 3 155.
9 2 4 171.
10 2 5 154.
# … with 15 more rows
Because you did not provide the data in an easily reused form, here is a dummy data frame similar to yours:
dat <-structure(list(Date = 1:5, Sensor1 = c(154.501112480648, 125.564142037183,
184.578892146237, 155.085407197475, 176.232917583548), Sensor2 = c(159.958130051382,
132.943481742404, 100.740377581678, 178.590174368583, 182.851045904681
), Sensor3 = c(125.962588260882, 155.333150480874, 122.294128965586,
122.685094899498, 150.199430575594), Sensor4 = c(162.315403693356,
170.65782523714, 117.775949183851, 145.122508681379, 193.589874636382
), Sensor5 = c(154.887120774947, 154.432400292717, 139.244429254904,
180.038237478584, 160.314362798817)), class = "data.frame", row.names = c(NA,
-5L))
To transform the data to the form you showed, you can use pivot_longer (which superseded gather) and then change the names as necessary.
dat |>
pivot_longer(cols = starts_with("Sensor")) |>
mutate(name = str_replace(name, "Sensor", "")) |>
rename(Day = Date, Sensor = name, Time = value)
# The result
# A tibble: 25 × 3
Day Sensor Time
<int> <chr> <dbl>
1 1 1 155.
2 1 2 160.
3 1 3 126.
4 1 4 162.
5 1 5 155.
6 2 1 126.
7 2 2 133.
8 2 3 155.
9 2 4 171.
10 2 5 154.
# … with 15 more rows

Pivot_longer to manipulate table

I would like to pivot variables nclaims, npatients, nproviders to show up underneath groups.
I believe I should be using pivot_longer but it doesn't work.
library(tidyr)
ptype <- c(0,1,2,0,1)
groups <- c(rep(1,3), rep(2,2))
nclaims <- c(10,23,32,12,8)
nproviders <- c(2,4,5,1,1)
npatients <- c(8, 20, 29, 9, 6)
dta <- data.frame(ptype=ptype, groups=groups, nclaims=nclaims, nproviders=nproviders, npatients=npatients)
table <- pivot_longer(everything(dta), names_to = "groups", values_to=c("nclaims", "npatients", "nproviders"))
Desired output:
We need to use pivot_longer, then pivot_wider:
dta %>%
pivot_longer(nclaims:npatients) %>%
# values_fill = 0 changes NA values to 0, as in your desired result
pivot_wider(names_from = ptype, values_from = value,
values_fill = 0)
groups name `0` `1` `2`
<dbl> <chr> <dbl> <dbl> <dbl>
1 1 nclaims 10 23 32
2 1 nproviders 2 4 5
3 1 npatients 8 20 29
4 2 nclaims 12 8 0
5 2 nproviders 1 1 0
6 2 npatients 9 6 0
Another approach, using reshape2::recast():
library( reshape2 )
recast( dta, groups + variable ~ ptype, id.var = c("ptype", "groups") )
# groups variable 0 1 2
# 1 1 nclaims 10 23 32
# 2 1 nproviders 2 4 5
# 3 1 npatients 8 20 29
# 4 2 nclaims 12 8 NA
# 5 2 nproviders 1 1 NA
# 6 2 npatients 9 6 NA

R Rename Group Column [duplicate]

I have a data frame (all_data) in which I have a list of sites (1... to n) and their scores e.g.
site score
1 10
1 11
1 12
4 10
4 11
4 11
8 9
8 8
8 7
I want to create a column that numbers each level of site in numerical order, like a counter. In the example, the sites (1, 4, and 8) would have a corresponding counter from 1 to 3 in the 'number' column:
site score number
1 10 1
1 11 1
1 12 1
4 10 2
4 11 2
4 11 2
8 9 3
8 8 3
8 7 3
I am sure this must be easily solved, but I have not found a way yet.
Try Data$number <- as.numeric(as.factor(Data$site))
On a side note: the difference between my solution and @Chase's on the one hand, and @DWin's on the other, is the ordering of the numbers. Both as.factor and factor automatically sort the levels, whereas that doesn't happen in @DWin's solution:
Dat <- data.frame(site = rep(c(1,8,4), each = 3), score = runif(9))
Dat$number <- as.numeric(factor(Dat$site))
Dat$sitenum <- match(Dat$site, unique(Dat$site) )
Gives
> Dat
site score number sitenum
1 1 0.7377561 1 1
2 1 0.3131139 1 1
3 1 0.7862290 1 1
4 8 0.4480387 3 2
5 8 0.3873210 3 2
6 8 0.8778102 3 2
7 4 0.6916340 2 3
8 4 0.3033787 2 3
9 4 0.6552808 2 3
In the new dplyr 1.0.0 we can use cur_group_id() which gives a unique numeric identifier to a group.
library(dplyr)
df %>% group_by(site) %>% mutate(number = cur_group_id())
# site score number
# <int> <int> <int>
#1 1 10 1
#2 1 11 1
#3 1 12 1
#4 4 10 2
#5 4 11 2
#6 4 11 2
#7 8 9 3
#8 8 8 3
#9 8 7 3
data
df <- structure(list(site = c(1L, 1L, 1L, 4L, 4L, 4L, 8L, 8L, 8L),
score = c(10L, 11L, 12L, 10L, 11L, 11L, 9L, 8L, 7L)),
class = "data.frame", row.names = c(NA, -9L))
Two other options:
1) Using the .GRP special symbol from the data.table package:
library(data.table)
setDT(dat)[, num := .GRP, by = site]
with the example dataset from below this results in:
> dat
site score num
1: 1 0.14945795 1
2: 1 0.60035697 1
3: 1 0.94643075 1
4: 8 0.68835336 2
5: 8 0.50553372 2
6: 8 0.37293624 2
7: 4 0.33580504 3
8: 4 0.04825135 3
9: 4 0.61894754 3
10: 8 0.96144729 2
11: 8 0.65496051 2
12: 8 0.51029199 2
2) Using the group_indices function from dplyr:
dat$num <- group_indices(dat, site)
or when you want to work around non-standard evaluation:
library(dplyr)
dat %>%
mutate(num = group_indices_(dat, .dots = c('site')))
which results in:
site score num
1 1 0.42480366 1
2 1 0.98736177 1
3 1 0.35766187 1
4 8 0.06243182 3
5 8 0.55617002 3
6 8 0.20304632 3
7 4 0.90855921 2
8 4 0.25215078 2
9 4 0.44981251 2
10 8 0.60288270 3
11 8 0.46946587 3
12 8 0.44941782 3
As can be seen, dplyr gives a different order of the group numbers.
If you want another number every time the group changes, there are several other options:
1) with base R:
# option 1:
dat$num <- cumsum(c(TRUE, head(dat$site, -1) != tail(dat$site, -1)))
# option 2:
x <- rle(dat$site)$lengths
dat$num <- rep(seq_along(x), times=x)
2) with the data.table package:
library(data.table)
setDT(dat)[, num := rleid(site)]
which all result in:
> dat
site score num
1 1 0.80817855 1
2 1 0.07881334 1
3 1 0.60092828 1
4 8 0.71477988 2
5 8 0.51384565 2
6 8 0.72011650 2
7 4 0.74994627 3
8 4 0.09564052 3
9 4 0.39782587 3
10 8 0.29446540 4
11 8 0.61725367 4
12 8 0.97427413 4
Used data:
dat <- data.frame(site = rep(c(1,8,4,8), each = 3), score = runif(12))
This should be fairly efficient and understandable:
Dat$sitenum <- match(Dat$site, unique(Dat$site))
Using the data from @Jaap, a different dplyr possibility using dense_rank() could be:
dat %>%
mutate(ID = dense_rank(site))
site score ID
1 1 0.1884490 1
2 1 0.1087422 1
3 1 0.7438149 1
4 8 0.1150771 3
5 8 0.9978203 3
6 8 0.7781222 3
7 4 0.4081830 2
8 4 0.2782333 2
9 4 0.9566959 2
10 8 0.2545320 3
11 8 0.1201062 3
12 8 0.5449901 3
Or a rleid()-like dplyr approach, with the data arranged first:
dat %>%
arrange(site) %>%
mutate(ID = with(rle(site), rep(seq_along(lengths), lengths)))
site score ID
1 1 0.1884490 1
2 1 0.1087422 1
3 1 0.7438149 1
4 4 0.4081830 2
5 4 0.2782333 2
6 4 0.9566959 2
7 8 0.1150771 3
8 8 0.9978203 3
9 8 0.7781222 3
10 8 0.2545320 3
11 8 0.1201062 3
12 8 0.5449901 3
Or using duplicated() and cumsum():
df %>%
mutate(ID = cumsum(!duplicated(site)))
The same with base R:
df$ID <- with(rle(df$site), rep(seq_along(lengths), lengths))
Or:
df$ID <- cumsum(!duplicated(df$site))
You can turn site into a factor and then return the numeric or integer values of that factor:
dat <- data.frame(site = rep(c(1,4,8), each = 3), score = runif(9))
dat$number <- as.integer(factor(dat$site))
dat
site score number
1 1 0.5305773 1
2 1 0.9367732 1
3 1 0.1831554 1
4 4 0.4068128 2
5 4 0.3438962 2
6 4 0.8123883 2
7 8 0.9122846 3
8 8 0.2949260 3
9 8 0.6771526 3
Another solution using the data.table package.
Example with the more complete dataset provided by Jaap:
setDT(dat)[, number := frank(site, ties.method = "dense")]
dat
site score number
1: 1 0.3107920 1
2: 1 0.3640102 1
3: 1 0.1715318 1
4: 8 0.7247535 3
5: 8 0.1263025 3
6: 8 0.4657868 3
7: 4 0.6915818 2
8: 4 0.3558270 2
9: 4 0.3376173 2
10: 8 0.7934963 3
11: 8 0.9641918 3
12: 8 0.9832120 3
Another way to do it, which I think is easy to understand even when you know little about R:
library(dplyr)
df <- data.frame('site' = c(1, 1, 1, 4, 4, 4, 8, 8, 8))
df <- mutate(df, 'number' = cumsum(site != lag(site, default=-1)))
I too recently needed a solution to this. I didn't find this thread, started my own question and was redirected here (thank you). It is good to see many solutions, but to me a scalable solution is important (and, I feel, good practice). Hence, I benchmarked several solutions below.
df <- data.table(country = rep(c('a', 'b', 'b', 'c', 'c', 'c'), 1e7)
)
a <-
microbenchmark(factor = {df[, group_id := as.integer(factor(country))]}
, unique_match = df[, group_id := match(country, unique(country))]
, rle = df[ , group_id := with(rle(country), rep(seq_along(lengths), lengths))]
, dup_cumsum = df[, group_id := cumsum(!duplicated(country))]
, frank = df[, group_id := frank(country, ties.method = "dense")]
, GRP = df[, group_id := .GRP, country]
, rleid = df[, group_id := rleid(country)]
, cumsum_head_tail = df[, group_id := cumsum(c(TRUE, head(country, -1) != tail(country, -1)))]
, times = 50
)
autoplot(a)
It would appear the podium is held by data.table.
Still, was great to learn of alternatives e.g. cumsum(!duplicated(country)). What a brainteaser!
Using collapse::group, Fast Hash-Based Grouping:
library(collapse)
d = data.frame(site = rep(c(1,8,4), each = 3))
settransform(d, number = group(site)) # settransform updates data by reference. See also ftransform.
d
# site number
# 1 1 1
# 2 1 1
# 3 1 1
# 4 8 2
# 5 8 2
# 6 8 2
# 7 4 3
# 8 4 3
# 9 4 3
The collapse functions are considerably faster on larger data. Here I compare with two common base idioms (factor / as.integer; match / unique), and two data.table methods (.GRP; frank), using 1e5 groups with 1e3 rows each.
library(data.table)
library(microbenchmark)
nr = 1e3
ng = 1e5
set.seed(1)
d1 = data.table(g = sample(1:ng, nr*ng, replace = TRUE))
d2 = copy(d1)
d3 = copy(d1)
d4 = copy(d1)
d5 = copy(d1)
microbenchmark(
factor = {d1[ , gi := as.integer(factor(g))]},
unique_match = {d2[, gi := match(g, unique(g))]},
frank = {d3[, gi := frank(g, ties.method = "dense")]},
GRP = {d4[, gi := .GRP, by = g]},
collap = {settransform(d5, gi = group(g))},
times = 20L)
# Unit: milliseconds
# expr min lq mean median uq max
# factor 46648.6099 48493.0146 49918.956 49336.208 51547.0789 53585.472
# unique_match 12662.0978 13057.3210 13534.391 13530.457 13998.0141 14407.036
# frank 2628.4923 2695.7064 3240.522 2833.950 3797.5579 5547.227
# GRP 2754.2153 3283.2444 3796.109 3717.239 4184.5174 5117.918
# collap 640.1993 668.2301 729.351 698.307 753.2932 1086.592
# Check equality with data.table .GRP. Use as.vector to remove attributes
all.equal(d4$gi, as.vector(d5$gi))
# [1] TRUE
Note: group, .GRP and match / unique all create the group number according to order of appearance of the different values in the original data (also discussed in previous posts).
If you want to keep your existing columns and assign back to the same data frame...
my_df <- my_df %>%
select(everything()) %>%
group_by(geo) %>%
mutate(geo_id = cur_group_id())
And you can do multiple columns like so...
my_df <- my_df %>%
select(everything()) %>%
group_by(geo) %>%
mutate(geo_id = cur_group_id()) %>%
group_by(state) %>%
mutate(state_id = cur_group_id()) %>%
group_by(name) %>%
mutate(name_id = cur_group_id())
If the numbers of the site column were unordered, we could use as_factor() in combination with fct_inorder() from the forcats package:
library(tibble)
library(dplyr)
library(forcats)
all_data_unordered <- tibble(site = c(1,1,1,8,8,8,4,4,4),
score = c(10,11,12,10,11,11,9,8,7))
all_data_unordered |>
mutate(number = as_factor(site) |> fct_inorder() |> as.integer())
#> # A tibble: 9 × 3
#> site score number
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 11 1
#> 3 1 12 1
#> 4 8 10 2
#> 5 8 11 2
#> 6 8 11 2
#> 7 4 9 3
#> 8 4 8 3
#> 9 4 7 3
Created on 2021-11-05 by the reprex package (v2.0.1)
Since dplyr 1.1.0, another option is consecutive_id:
library(dplyr)
df %>%
mutate(id = consecutive_id(site))
# site score id
# 1 1 10 1
# 2 1 11 1
# 3 1 12 1
# 4 4 10 2
# 5 4 11 2
# 6 4 11 2
# 7 8 9 3
# 8 8 8 3
# 9 8 7 3
Note that consecutive_id, like data.table::rleid but unlike cur_group_id or as.numeric(factor(.)), returns an ID for each run of consecutive values, meaning that if the same value appears again later, non-consecutively, it gets a new id.
df <- structure(list(site = c(1L, 1L, 1L, 4L, 4L, 4L, 1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -9L))
df %>%
mutate(cons_id = consecutive_id(site)) %>%
group_by(site) %>%
mutate(cur_group_id = cur_group_id())
# site cons_id cur_group_id
# 1 1 1 1
# 2 1 1 1
# 3 1 1 1
# 4 4 2 2
# 5 4 2 2
# 6 4 2 2
# 7 1 3 1
# 8 1 3 1
# 9 1 3 1

Creating a new data frame using existing data

I would like to create a new data frame from my existing data frame "ab". The new data frame should look like "Newdf".
a<- c(1:5)
b<-c(11:15)
ab<-data.frame(C1=a,c2=b)
ab
df<-c(1,11,2,12,3,13,4,14,5,15)
CMT<-c(1:2)
CMT1<-rep.int(CMT,times=5)
Newdf<-data.frame(DV=df,Comp=CMT1)
Newdf
Can we use dplyr package? If yes, how?
More importantly than dplyr, you'd need tidyr:
library(tidyr)
library(dplyr)
ab %>%
gather(Comp, DV) %>%
mutate(Comp = recode(Comp, "C1" = 1, "c2" = 2))
# Comp DV
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 5 1 5
# 6 2 11
# 7 2 12
# 8 2 13
# 9 2 14
# 10 2 15
Using dplyr and tidyr gives you something close...
library(tidyr)
library(dplyr)
df2 <- ab %>%
mutate(Order=1:n()) %>%
gather(key=Comp,value=DV,C1,c2) %>%
arrange(Order) %>%
mutate(Comp=recode(Comp,"C1"=1,"c2"=2)) %>%
select(DV,Comp)
df2
DV Comp
1 1 1
2 11 2
3 2 1
4 12 2
5 3 1
6 13 2
7 4 1
8 14 2
9 5 1
10 15 2
Although the OP has asked for a dplyr solution, I felt challenged to look for a data.table solution. So, FWIW, here is an alternative approach using melt().
Note that this solution does not depend on specific column names in ab as the two other dplyr solutions do. In addition, it should be working for more than two columns in ab as well (untested).
library(data.table)
melt(setDT(ab, keep.rownames = TRUE), id.vars = "rn", value.name = "DV"
)[, Comp := rleid(variable)
][order(rn)][, c("rn", "variable") := NULL][]
# DV Comp
# 1: 1 1
# 2: 11 2
# 3: 2 1
# 4: 12 2
# 5: 3 1
# 6: 13 2
# 7: 4 1
# 8: 14 2
# 9: 5 1
#10: 15 2
Data
ab <- structure(list(C1 = 1:5, c2 = 11:15), .Names = c("C1", "c2"),
row.names = c(NA, -5L), class = "data.frame")
ab
# C1 c2
#1 1 11
#2 2 12
#3 3 13
#4 4 14
#5 5 15

Resources