Reorder Dataset in R based on Country - r

structure(list(Date = c("KW 52 / 2016", "KW 1 / 2017", "KW 2 / 2017",
"KW 3 / 2017"), Sales_AT = c(150L, 169L, 143L, 170L), Sales_CH = c(150L,
169L, 143L, 170L), Sales_GER = c(150L, 169L, 143L, 170L), Sales_HUN = c(134L,
139L, NA, 125L), Sales_JP = c(134L, NA, 142L, 125L), Sales_POL = c(127L,
175L, 150L, 141L), Sales_SWE = c(125L, NA, 159L, 131L), Sales_USA = c(169L,
159L, NA, 132L), difference_AT = c(NA, 19L, -26L, 27L), difference_CH = c(NA,
19L, -26L, 27L), difference_GER = c(NA, 19L, -26L, 27L), difference_HUN = c(NA,
5L, NA, -14L), difference_JP = c(NA, NA, 8L, -17L), difference_POL = c(NA,
48L, -25L, -9L), difference_SWE = c(NA, NA, 34L, -28L), difference_USA = c(NA,
-10L, NA, -27L)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-4L))
This is my dataset which looks like this:
A tibble: 4 x 17
Date Sales_AT Sales_CH Sales_GER Sales_HUN Sales_JP Sales_POL Sales_SWE Sales_USA difference_AT difference_CH difference_GER difference_HUN difference_JP difference_POL difference_SWE difference_USA
<chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 KW 52 / 2016 150 150 150 134 134 127 125 169 NA NA NA NA NA NA NA NA
2 KW 1 / 2017 169 169 169 139 NA 175 NA 159 19 19 19 5 NA 48 NA -10
3 KW 2 / 2017 143 143 143 NA 142 150 159 NA -26 -26 -26 NA 8 -25 34 NA
4 KW 3 / 2017 170 170 170 125 125 141 131 132 27 27 27 -14 -17 -9 -28 -27
I want to reorder the dataset to have the sales and difference column of each country next to each other.
I'm looking for a dplyr solution that works like this, but in a dynamic way:
wide_result %>%
select(contains("AT"), contains("CH"), contains("HUN"), contains("JP"), contains("USA"))
Can anyone help me?

Using base R:
df[c(1, order(sub(".*_", "", names(df)[-1])) + 1)]

Here's a way we can do it. Basically, we put the names of the data into a tibble, extract the part of the name after the _ (when possible), and then sort by that extracted text.
# Build the column order: every non-Date column, sorted by the country code
# that follows the last "_" in its name. arrange() keeps tied names in their
# original order, so Sales_* stays ahead of difference_* for each country.
names_sort <- tibble(nn = names(dat)) %>%
  filter(nn != "Date") %>% # remove Date column, since we'll select that first
  # strip everything up to and including the last "_" to get the sort key
  mutate(names_fix = sub(".*_", "", nn)) %>%
  arrange(names_fix) %>%
  pull(nn)
# all_of() marks names_sort as an external character vector of column names;
# a bare vector inside select() is ambiguous with a column of the same name
# and is deprecated by tidyselect.
dat %>%
  select(Date, all_of(names_sort))
# Date Sales_AT difference_AT Sales_CH difference_CH
# <chr> <int> <int> <int> <int>
# 1 KW 52 / 2016 150 NA 150 NA
# 2 KW 1 / 2017 169 19 169 19
# 3 KW 2 / 2017 143 -26 143 -26
# 4 KW 3 / 2017 170 27 170 27

You can use dplyr select_at:
# Country codes to keep, in the order they should appear. Named `countries`
# rather than `vars` so it does not shadow dplyr's vars() helper used below.
countries <- c("CH", "AT")
df %>%
  select_at(vars(one_of("Date",
                        paste0("Sales_", countries),
                        paste0("difference_", countries))))
# A tibble: 4 x 5
Date Sales_CH Sales_AT difference_CH difference_AT
<chr> <int> <int> <int> <int>
1 KW 52 / 2016 150 150 NA NA
2 KW 1 / 2017 169 169 19 19
3 KW 2 / 2017 143 143 -26 -26
4 KW 3 / 2017 170 170 27 27

Related

Dividing non-equal dataframes based on a group condition

I have two dataframes, with a similar structure:
df_I <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1220, 1230, 1310, 1320, 1330), `1` = c(1L,
8L, 2L, 2L, 0L, 2L, 0L, 1L, 0L), `2` = c(0L, 10L, 0L, 0L,
0L, 2L, 1L, 3L, 1L), `3` = c(4L, 2L, 1L, 2L, 0L, 4L,
0L, 0L, 3L), `4` = c(4L, 6L, 0L, 3L, 1L, 3L, 0L, 0L, 3L),
totaal = c(11, 26, 3, 7, 1, 9, 7, 7, 6)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 9 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 0 4 4 11
2 2006 1110 8 10 2 6 26
3 2006 1120 2 0 1 0 3
4 2006 1130 2 0 2 3 7
5 2006 1220 0 0 0 1 1
6 2006 1230 2 2 4 3 9
7 2006 1310 0 1 0 0 7
8 2006 1320 1 3 0 0 7
9 2006 1330 0 1 3 3 6
df_II <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1210, 1220, 1230, 1310, 1320, 1330), `1` = c(15806L,
655L, 105L, 328L, 138L, 452L, 445L, 471L, 672L, 615L), `2` = c(9681L,
337L, 68L, 215L, 97L, 357L, 366L, 245L, 440L, 360L), `3` = c(10457L,
221L, 40L, 123L, 65L, 325L, 322L, 151L, 352L, 332L), `4` = c(7109L,
128L, 5L, 64L, 56L, 256L, 240L, 83L, 274L, 192L), totaal = c(43053,
1341, 218, 730, 356, 1390, 1373, 950, 1738, 1499)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 15806 9681 10457 7109 43053
2 2006 1110 655 337 221 128 1341
3 2006 1120 105 68 40 5 218
4 2006 1130 328 215 123 64 730
5 2006 1210 138 97 65 56 356
6 2006 1220 452 357 325 256 1390
7 2006 1230 445 366 322 240 1373
8 2006 1310 471 245 151 83 950
9 2006 1320 672 440 352 274 1738
10 2006 1330 615 360 332 192 1499
I would like to create a new data.frame df_out, which divides df_I by df_II, for columns 1,2,3,4, totaal by year and code. The issue is that not every code is available for each year.
What is the best way to divide this unequal dataframe?
Desired outcome:
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 /15806 0/9681 4/10457 4/7109 11/43053
You could subset the second data frame using %in%, assuming both code columns are properly ordered.
cols <- as.character(1:4)
cbind(df_I[setdiff(names(df_I), cols)], df_I[cols] / subset(df_II, code %in% df_I$code, cols))
# year code totaal 1 2 3 4
# 1 2006 0 11 6.326711e-05 0.000000000 0.0003825189 0.000562667
# 2 2006 1110 26 1.221374e-02 0.029673591 0.0090497738 0.046875000
# 3 2006 1120 3 1.904762e-02 0.000000000 0.0250000000 0.000000000
# 4 2006 1130 7 6.097561e-03 0.000000000 0.0162601626 0.046875000
# 5 2006 1220 1 0.000000e+00 0.000000000 0.0000000000 0.003906250
# 6 2006 1230 9 4.494382e-03 0.005464481 0.0124223602 0.012500000
# 7 2006 1310 7 0.000000e+00 0.004081633 0.0000000000 0.000000000
# 8 2006 1320 7 1.488095e-03 0.006818182 0.0000000000 0.000000000
# 9 2006 1330 6 0.000000e+00 0.002777778 0.0090361446 0.015625000
You could use complete to make the number of rows between the two data frames equal, and then do the division:
library(tidyr)
# Pad df_I with a row for every code present in df_II, carry the year into
# the new rows and turn the remaining NAs into 0. The result is assigned back
# explicitly: `%<>%` is a magrittr operator that library(tidyr) does not
# provide.
df_I <- df_I %>%
  complete(code = df_II$code) %>%
  fill(year) %>%
  replace(is.na(.), 0)
# Element-wise division by position, dropping the first two (key) columns of
# each frame. NOTE(review): this assumes both frames now hold the same codes
# in the same row order -- confirm for data that is not already sorted by code.
cbind(df_I[c(1, 2)], df_I[-c(1, 2)] / df_II[-c(1, 2)])
code year `1` `2` `3` `4` totaal
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 2006 0.0000633 0 0.000383 0.000563 0.000255
2 1110 2006 0.0122 0.0297 0.00905 0.0469 0.0194
3 1120 2006 0.0190 0 0.025 0 0.0138
4 1130 2006 0.00610 0 0.0163 0.0469 0.00959
5 1210 2006 0 0 0 0 0
6 1220 2006 0 0 0 0.00391 0.000719
7 1230 2006 0.00449 0.00546 0.0124 0.0125 0.00655
8 1310 2006 0 0.00408 0 0 0.00737
9 1320 2006 0.00149 0.00682 0 0 0.00403
10 1330 2006 0 0.00278 0.00904 0.0156 0.00400

How to calculate the sum of periods over each column for each row in R

I would like to calculate the sum of each flower in each year in R. Below is an example of how the table looks (Table 1) and what I want the outcome to be (Table 2). I know how to do the code calculation in a long table format but I am not sure how to do it in a wide table format. Note: I am using package: dplyr
(Table 1)
flower
1902
1950
2010
2012
2021
lily
23
0
0
8
5
rose
50
60
5
16
0
daisy
30
7
10
2
0
I need to calculate the sum for each flower in each year. The end result should give me:
(Table 2)
flower
1902
1950
2010
2012
2021
lily
23
23
23
31
36
rose
50
110
115
131
131
daisy
30
37
47
49
49
One option involving dplyr and purrr might be:
dat %>%
mutate(pmap_dfr(across(-1), ~ cumsum(c(...))))
flower X1902 X1950 X2010 X2012 X2021
1 lily 23 23 23 31 36
2 rose 50 110 115 131 131
3 daisy 30 37 47 49 49
Using rowCumsums from matrixStats
library(matrixStats)
df1[-1] <- rowCumsums(as.matrix(df1[-1]))
-output
df1
flower X1902 X1950 X2010 X2012 X2021
1 lily 23 23 23 31 36
2 rose 50 110 115 131 131
3 daisy 30 37 47 49 49
Here is one way of getting your expected result:
Your data frame :
dat <- structure(list(flower = c("lily", "rose", "daisy"), X1902 = c(23L,
50L, 30L), X1950 = c(0L, 60L, 7L), X2010 = c(0L, 5L, 10L), X2012 = c(8L,
16L, 2L), X2021 = c(5L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
Apply a function that calculates the cumulative sums to each row of the data, over columns 2 to 6:
dat[1:nrow(dat), 2:6] <- t(apply(dat[1:nrow(dat), 2:6], 1, function(x) cumsum(c(x))))
# The result
dat
flower X1902 X1950 X2010 X2012 X2021
1 lily 23 23 23 31 36
2 rose 50 110 115 131 131
3 daisy 30 37 47 49 49
@benson23 has kindly suggested the following simpler code to get the same result:
dat[, 2:6] <- t(apply(dat[,2:6], 1, cumsum))
flower X1902 X1950 X2010 X2012 X2021
1 lily 23 23 23 31 36
2 rose 50 110 115 131 131
3 daisy 30 37 47 49 49
You can use apply with cumsum, plus a little bit of re-formatting.
# Row-wise cumulative sums over every column except the first. The label
# column is bound as a one-column data frame (df[1]), not as a vector
# (df[, 1]): cbind() of a character vector with a numeric matrix would turn
# all the sums into character strings. The column names come from df[1] and
# from the matrix, so no setNames() is needed.
cbind(df[1], t(apply(df[-1], 1, cumsum)))
flower X1902 X1950 X2010 X2012 X2021
1 lily 23 23 23 31 36
2 rose 50 110 115 131 131
3 daisy 30 37 47 49 49
Data
df <- structure(list(flower = c("lily", "rose", "daisy"), X1902 = c(23L,
50L, 30L), X1950 = c(0L, 60L, 7L), X2010 = c(0L, 5L, 10L), X2012 = c(8L,
16L, 2L), X2021 = c(5L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-3L))
Here is an alternative using pivoting:
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(-flower) %>%
group_by(flower) %>%
mutate(value = cumsum(value)) %>%
pivot_wider() %>%
ungroup()
flower X1902 X1950 X2010 X2012 X2021
<chr> <int> <int> <int> <int> <int>
1 lily 23 23 23 31 36
2 rose 50 110 115 131 131
3 daisy 30 37 47 49 49

Split data frame by class regarding to OID

I am trying to split a data frame 50/50 by class. However, I do not want to split fields with the same OID (object identifier); I would like the fields with the same OID to be in the same set.
#Data frame:
"b1""b2""b3""CLASS" "OID"
110 134 119 "tree" 1
112 133 118 "tree" 1
105 125 110 "tree" 2
112 132 117 "tree" 2
109 125 115 "meadow" 6
93 110 101 "meadow" 6
86 106 95 "meadow" 7
105 136 116 "meadow" 7
102 128 111 "meadow" 8
108 129 115 "meadow" 8
113 134 119 "meadow" 8
Expected data:
#Expected:
"b1""b2""b3""CLASS" "OID"
110 134 119 "tree" 1
112 133 118 "tree" 1
109 125 115 "meadow" 6
93 110 101 "meadow" 6
86 106 95 "meadow" 7
105 136 116 "meadow" 7
This selects the top half of rows in each group, plus any rows which have the same OID as the rows in that top half.
library(dplyr)
df %>%
group_by(CLASS) %>%
filter(OID %in% head(OID, n() %/% 2)) %>%
ungroup
# # A tibble: 6 x 5
# b1 b2 b3 CLASS OID
# <int> <int> <int> <chr> <int>
# 1 110 134 119 tree 1
# 2 112 133 118 tree 1
# 3 109 125 115 meadow 6
# 4 93 110 101 meadow 6
# 5 86 106 95 meadow 7
# 6 105 136 116 meadow 7
If your real data is arranged by OID like this example, you could also use top_frac
df %>%
group_by(CLASS) %>%
top_frac(.5, -OID)
# # A tibble: 6 x 5
# b1 b2 b3 CLASS OID
# <int> <int> <int> <chr> <int>
# 1 110 134 119 tree 1
# 2 112 133 118 tree 1
# 3 109 125 115 meadow 6
# 4 93 110 101 meadow 6
# 5 86 106 95 meadow 7
# 6 105 136 116 meadow 7
Your data:
df = structure(list(b1 = c(110L, 112L, 105L, 112L, 109L, 93L, 86L,
105L, 102L, 108L, 113L), b2 = c(134L, 133L, 125L, 132L, 125L,
110L, 106L, 136L, 128L, 129L, 134L), b3 = c(119L, 118L, 110L,
117L, 115L, 101L, 95L, 116L, 111L, 115L, 119L), CLASS = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("meadow",
"tree"), class = "factor"), OID = c(1L, 1L, 2L, 2L, 6L, 6L, 7L,
7L, 8L, 8L, 8L)), class = "data.frame", row.names = c(NA, -11L
))
First create a function to take 1/2 according to OID
# Keep roughly the first half of the rows of `x` without splitting an OID:
# every row whose OID occurs among the first round(nrow(x) / 2) rows is kept.
#
# x: data frame with an `OID` column; its row order decides which OIDs fall
#    in the first half.
# Returns the matching rows of `x` (zero rows when the half-size rounds to 0).
func <- function(x) {
  n_half <- round(nrow(x) / 2)
  # head() yields no OIDs when n_half is 0, whereas 1:n_half would count down
  # to c(1, 0) and silently keep the first OID.
  x[x$OID %in% head(x$OID, n_half), ]
}
We randomize the way the OID are sorted
df$OID = factor(df$OID,levels=sample(unique(df$OID)))
df = df[order(df$OID),]
do.call(rbind,by(df,df$CLASS,func))
This will ensure you get a random ~50% every time, with complete OIDs.

Change data set from wide to long while retaining group id, and also gathering columns [duplicate]

This question already has answers here:
Reshaping multiple sets of measurement columns (wide format) into single columns (long format)
(8 answers)
Closed 5 years ago.
I'd really appreciate some help getting this messy set of new survey data into a usable form. It was collected in a strange way and now I've got strange data to work with. I've looked through tidyr and tried those approaches to no avail. I suspect my problem is that I'm thinking about this dataset all wrong and I'm blind to some real answer. But given all the things I need to do to this df, I can't figure out where to start and thus where to start googling.
What I need:
For each person to be their own row
Each person retains their GroupID and Treated value
For the variables currently attached to each person individually to become columns (age, weight, height)
Fake (and much smaller):
structure(list(GroupID = 1:5, Treated = c("Y", "Y", "N", "Y",
"N"), person1_age = c(45L, 33L, 71L, 19L, 52L), person1_weight = c(187L,
145L, 136L, 201L, 168L), person1_height = c(69L, 64L, 51L, 70L,
66L), person2_age = c(54L, 20L, 48L, 63L, 26L), person2_weight = c(140L,
122L, 186L, 160L, 232L), person2_height = c(62L, 70L, 65L, 72L,
74L), person3_age = c(21L, 56L, 40L, 59L, 67L), person3_weight = c(112L,
143L, 187L, 194L, 159L), person3_height = c(61L, 69L, 73L, 63L,
72L)), .Names = c("GroupID", "Treated", "person1_age", "person1_weight",
"person1_height", "person2_age", "person2_weight", "person2_height",
"person3_age", "person3_weight", "person3_height"), row.names = c(NA,
5L), class = "data.frame")
Any help or further readings you could point me to would be very much appreciated.
reshape can do this, with the appropriate arguments:
> # varying is given as a list (one vector of column names per measure: age, weight, height) so that each set lines up with v.names by position
> reshape(x, direction="long", varying=split(names(x)[3:11], rep(1:3, 3)), timevar='person', v.names=c('age', 'weight', 'height'))
GroupID Treated person age weight height id
1.1 1 Y 1 45 187 69 1
2.1 2 Y 1 33 145 64 2
3.1 3 N 1 71 136 51 3
4.1 4 Y 1 19 201 70 4
5.1 5 N 1 52 168 66 5
1.2 1 Y 2 54 140 62 1
2.2 2 Y 2 20 122 70 2
3.2 3 N 2 48 186 65 3
4.2 4 Y 2 63 160 72 4
5.2 5 N 2 26 232 74 5
1.3 1 Y 3 21 112 61 1
2.3 2 Y 3 56 143 69 2
3.3 3 N 3 40 187 73 3
4.3 4 Y 3 59 194 63 4
5.3 5 N 3 67 159 72 5
This relies on the order of the columns in your original data, for the varying argument, being in increasing order in the original data.
If that's not the case, specify varying manually. Here's what is used above:
> names(x)[3:11]
[1] "person1_age" "person1_weight" "person1_height" "person2_age" "person2_weight" "person2_height"
[7] "person3_age" "person3_weight" "person3_height"
We can also use melt from data.table which can take multiple patterns in the measure argument
library(data.table)
melt(setDT(x), measure = patterns("age$", "weight$", "height$"),
variable.name = "person", value.name = c("age", "weight", "height"))
# GroupID Treated person age weight height
# 1: 1 Y 1 45 187 69
# 2: 2 Y 1 33 145 64
# 3: 3 N 1 71 136 51
# 4: 4 Y 1 19 201 70
# 5: 5 N 1 52 168 66
# 6: 1 Y 2 54 140 62
# 7: 2 Y 2 20 122 70
# 8: 3 N 2 48 186 65
# 9: 4 Y 2 63 160 72
#10: 5 N 2 26 232 74
#11: 1 Y 3 21 112 61
#12: 2 Y 3 56 143 69
#13: 3 N 3 40 187 73
#14: 4 Y 3 59 194 63
#15: 5 N 3 67 159 72

R- create new columns based on levels of a freq table variable

Hi I am new to R so please bear with me,
I have my data arranged like so,
Length Seq X
28 GTGCACCGCAAGTGCTTCTAAGAAGGAT 19
28 TGCACCGCAAGTGCTTCTAAGAAGGATC 18
29 GTGCACCGCAAGTGCTTCTAAGAAGGATC 19
29 GTGCACCGCAAGTGCTTCTAAGAAGGATC 19
and I used
count(dF, vars=c("Length", "X"))
to generate a freq table that looks like:
Length X freq
28 15 160
28 16 163
28 17 21
29 15 198
29 16 410
29 17 104
How can I rearrange the data so that it looks something like this?
Length 15 16 17 total
28 160 163 21 344
29 198 410 104 712
30 205 614 393 1212
Tot 2746 6564 2012 11322
(I know these values are wrong)
If you want it to look like your example:
# your data
df<- data.frame(Length = c(28, 28, 28, 29, 29, 29),
X = c(15, 16, 17, 15, 16, 17),
freq = c(160, 163, 21, 198, 410, 104))
use this function
require(reshape)
# Turn `a` into a wide Length-by-X table with margins.
# a: data handed to cast() with the formula Length ~ X.
# Returns the cast table with one extra column holding rowSums() and one extra
# row holding colSums(); the name of that column and the first cell of that
# row are both set to "total".
tabler <- function(a) {
  wide <- cast(a, Length ~ X)
  with_row_sums <- cbind(wide, rowSums(wide))
  out <- rbind(with_row_sums, colSums(with_row_sums))
  # Label the margin row first, then the margin column.
  out[nrow(out), 1] <- "total"
  colnames(out)[ncol(out)] <- "total"
  out
}
tabler(df)
returns:
Length 15 16 17 total
1 28 160 163 21 344
2 29 198 410 104 712
3 total 358 573 125 1056
A base R option is
addmargins(xtabs(freq~Length+X, df1))
# X
#Length 15 16 17 Sum
# 28 160 163 21 344
# 29 198 410 104 712
# Sum 358 573 125 1056
data
df1 <- structure(list(Length = c(28L, 28L, 28L, 29L, 29L, 29L),
X = c(15L,
16L, 17L, 15L, 16L, 17L), freq = c(160L, 163L, 21L, 198L, 410L,
104L)), .Names = c("Length", "X", "freq"), class = "data.frame",
row.names = c(NA, -6L))

Resources