I have this data frame-
# Example data: one row per input_*/output_* series, numeric columns a, b, c.
input_output <- data.frame(
  ip_op = c('input_0', 'input_2', 'input_9', 'output_1', 'output_2', 'output_3'),
  a = c(1, 32, 12, 246, 901, 837),
  b = c(284, 23, 19, 284, 9, 12),
  c = c(12, 8940, 379, 490, 0, 12)
)
ip_op a b c
1 input_0 1 284 12
2 input_2 32 23 8940
3 input_9 12 19 379
4 output_1 246 284 490
5 output_2 901 9 0
6 output_3 837 12 12
I want to create the following data frame-
input_output
type input output
1 a 45 1984
2 b 326 305
3 c 9331 502
I have tried using transpose but the column names become rownames. How do I transform this data frame?
Does this work:
> library(dplyr)
> library(tidyr)
> library(stringr)  # str_extract() lives in stringr; dplyr/tidyr do not attach it
> # Long format -> reduce "input_0"/"output_1"/... to "input"/"output" ->
> # sum per (ip_op, name) -> one column per ip_op, with `name` renamed to `type`.
> input_output %>% pivot_longer(-ip_op) %>% mutate(ip_op = str_extract(ip_op, ('input|output'))) %>% group_by(ip_op, name) %>% summarise(value = sum(value)) %>%
+ pivot_wider(names_from = ip_op, values_from = value) %>% rename(type = name)
`summarise()` regrouping output by 'ip_op' (override with `.groups` argument)
# A tibble: 3 x 3
type input output
<chr> <dbl> <dbl>
1 a 45 1984
2 b 326 305
3 c 9331 502
>
Try reshaping the data to long format, then separate the variable name so that only the prefix (input/output) is kept. Aggregate the values within the respective groups and then reshape back to wide. Here is the code using tidyverse functions:
library(tidyverse)
# Long format -> split ip_op at "_" and keep only the prefix -> sum -> wide.
new <- input_output %>%
  pivot_longer(-1) %>%                               # every column except the first (ip_op)
  separate(ip_op, c('Var1', 'Var2'), sep = '_') %>%  # "input_0" -> Var1 = "input", Var2 = "0"
  select(-Var2) %>%
  group_by(Var1, name) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  # .groups = 'drop' returns an ungrouped result and silences the regrouping message.
  summarise(value = sum(value, na.rm = TRUE), .groups = 'drop') %>%
  pivot_wider(names_from = Var1, values_from = value)
Output:
# A tibble: 3 x 3
name input output
<chr> <dbl> <dbl>
1 a 45 1984
2 b 326 305
3 c 9331 502
Here is a dplyr and tidyr solution.
library(dplyr)
library(tidyr)
# Sum a, b, c per prefix first, then flip the table: long on the value
# columns, wide on the prefix.
input_output %>%
  group_by(ip_op = sub("_.*$", "", ip_op)) %>%
  summarise(
    across(a:c, function(col) sum(col, na.rm = TRUE)),
    .groups = "keep"
  ) %>%
  pivot_longer(cols = a:c, names_to = 'type', values_to = 'value') %>%
  pivot_wider(id_cols = type, names_from = ip_op, values_from = value)
## A tibble: 3 x 3
# type input output
# <chr> <dbl> <dbl>
#1 a 45 1984
#2 b 326 305
#3 c 9331 502
Here is a base R option using aggregate + reshape like below
# Column sums per prefix: the "_<digits>" suffix is removed from ip_op first.
u <- aggregate(
  . ~ ip_op,
  data = transform(input_output, ip_op = gsub("_\\d+", "", ip_op)),
  FUN = sum
)
# Stack the summed columns (values/ind), tag each row with its prefix, and
# spread the prefixes into columns keyed by the original column name (ind).
reshape(
  data = cbind(stack(u[-1]), type = u[["ip_op"]]),
  idvar = "ind",
  timevar = "type",
  direction = "wide"
)
which gives
ind values.input values.output
1 a 45 1984
3 b 326 305
5 c 9331 502
We can use data.table
library(data.table)
# NOTE: setDT() converts input_output to a data.table by reference.
# Melt to long (ip_op, type, value), strip the "_<n>" suffix from ip_op, then
# cast back to wide, summing the values that share a (type, ip_op) pair.
# Argument names are spelled out in full (id.vars, fun.aggregate) instead of
# relying on partial/positional matching.
dcast(
  melt(setDT(input_output), id.vars = 'ip_op',
       variable.name = 'type')[, ip_op := sub("_.*", "", ip_op)],
  type ~ ip_op,
  value.var = 'value',
  fun.aggregate = sum
)
-output
# type input output
#1: a 45 1984
#2: b 326 305
#3: c 9331 502
Or using transpose
# Strip the numeric suffix in place, sum every remaining column per ip_op,
# then transpose so the ip_op values become the column names.
data.table::transpose(
  setDT(input_output)[, ip_op := sub("_\\d+$", "", ip_op)][
    , lapply(.SD, sum), by = ip_op],
  keep.names = 'type',
  make.names = 'ip_op'
)
-output
# type input output
#1: a 45 1984
#2: b 326 305
#3: c 9331 502
Or with tidyverse and data.table::transpose
library(dplyr)
library(stringr)
input_output %>%
  group_by(ip_op = str_remove(ip_op, '_\\d+$')) %>%
  # Extra arguments are passed inside the lambda: forwarding them through
  # across(...) is deprecated in current dplyr.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)), .groups = 'drop') %>%
  data.table::transpose(make.names = 'ip_op', keep.names = 'type')
# type input output
#1 a 45 1984
#2 b 326 305
#3 c 9331 502
Related
I have sales and cost data by model. The code below selects the TOP10 models by sales; all the remaining models are summed up in a new category "Others", which becomes the 11th row.
# Sales and cost per model (models "A" to "N").
df <- data.frame(
  model = LETTERS[1:14],
  sale = c(100, 300, 140, 456, 345, 456, 456, 780, 40, 560, 560, 456, 350, 500),
  cost = c(1340, 330, 440, 443, 365, 437, 478, 700, 30, 460, 160, 456, 650, 100)
)
# TOP10 by sale: every model ranked below 10 is pooled into 'Others' at position 11
order <- df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale))) %>%
  mutate(model = ifelse(pos > 10, 'Others', model)) %>%
  mutate(pos = ifelse(pos > 10, 11, pos)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos)
Output:
model pos cost sale
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 E 10 365 345
11 Others 11 2140 580
The sales of model A are put in "Others" since it has a low sale value (100) and is not in the TOP10.
Now, I want to include A in this TOP 10 in any case, no matter how much sale it has. So output should be TOP9 + 'A' + 'Others':
Expected output:
model pos cost sale
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 A 10 1340 100
11 Others 11 2140 580
(Thus change must be done in the given code)
To be able to automate it, I created a function for you. You can easily specify the parameters and get the output quickly. Also, this function can be modified at a later stage based on your requirements:
#' Summarise the top-N models by sales, pooling the remainder into "Other"
#'
#' Models are ranked by descending `sale` (ties keep their input order).
#' Models listed in `IncludeModels` are always reported individually; the
#' remaining slots (up to `TopN` rows in total) go to the best-selling other
#' models. Everything else is summed into a single "Other" row.
#'
#' @param DataFrame Data frame with columns `model`, `sale` and `cost`.
#' @param TopN Number of individually reported models. Default 10.
#' @param IncludeModels Optional vector of model names that must be reported
#'   individually regardless of their sales. Names not present in
#'   `DataFrame$model` are ignored. Default: none.
#' @return A tibble with columns `Include`, `Ranking`, `cost`, `sale`, sorted
#'   by `Ranking`. Kept models are ranked 1, 2, ... in sales order; the pooled
#'   "Other" row (if any) is ranked last (`TopN + 1` in the usual case).
Top10BySales <- function(DataFrame, TopN = 10, IncludeModels = character(0)) {
  stopifnot(
    all(c("model", "sale", "cost") %in% names(DataFrame)),
    is.numeric(TopN), length(TopN) == 1
  )

  ## Rank the models by sales
  ranked <- DataFrame %>% arrange(desc(sale))

  ## Forced models always keep a slot; the rest of the TopN slots are filled
  ## with the best-selling non-forced models.
  forced <- ranked$model %in% IncludeModels
  free_slots <- max(TopN - sum(forced), 0)
  keep <- forced | (!forced & cumsum(!forced) <= free_slots)

  ## The pooled row always sorts after every kept model, even when more than
  ## TopN models were forced in.
  pooled_rank <- max(TopN, sum(keep)) + 1

  ## Assigned with `$<-` so the local vectors cannot be masked by columns of
  ## the same name; as.character() guards against factor models being turned
  ## into integer codes by ifelse().
  ranked$Include <- ifelse(keep, as.character(ranked$model), "Other")
  ranked$Ranking <- ifelse(keep, cumsum(keep), pooled_rank)

  ## Grouping the models
  ranked %>%
    group_by(Include, Ranking) %>%
    summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
    arrange(Ranking)
}
Using the function
To use this function, please see the picture below:
Hope this helps!
One way could be making use of bind_rows after removing the 10th line and adding only where model == A:
library(tidyverse)
#TOP10 by sale, with model "A" forced into position 10
df %>%
type.convert(as.is = TRUE) %>%
mutate(pos = row_number(desc(sale)),
model = ifelse(pos>10, 'Others', model),
pos = ifelse(pos>10, 11, pos)) %>%
group_by(model, pos) %>%
summarise(cost= sum(cost), sale= sum(sale), .groups = 'drop') %>%
arrange(pos) %>%
# Remove the row currently sitting at position 10 to make room for "A".
# NOTE(review): the removed row's cost/sale are not added to 'Others', and a
# low-ranked "A" is still counted inside 'Others' as well -- confirm intended.
slice(-10) %>%
# Append A's original row; it has no pos column, so pos is NA after binding.
bind_rows(df %>%
filter(model == "A")) %>%
# Give the appended row position 10, then restore the ordering.
mutate(pos = replace_na(pos, 10)) %>%
arrange(pos)
model pos cost sale
<chr> <dbl> <dbl> <dbl>
1 H 1 700 780
2 J 2 460 560
3 K 3 160 560
4 N 4 100 500
5 D 5 443 456
6 F 6 437 456
7 G 7 478 456
8 L 8 456 456
9 M 9 650 350
10 A 10 1340 100
11 Others 11 2140 580
You can mutate in 2 steps, before grouping and manipulate the pos variable to fix the order. This solves the problem in the comments to the other answer.
# Positions 1-10 are kept as ranked; a low-ranked "A" is parked at 11 and
# everything else beyond the top 10 is pooled as 'Others' at 12.
order <- df %>%
  type.convert(as.is = TRUE) %>%
  mutate(pos = row_number(desc(sale))) %>%
  mutate(pos = ifelse(pos <= 10, pos, ifelse(model == "A", 11, 12))) %>%
  mutate(model = ifelse(pos > 11, 'Others', model)) %>%
  group_by(model, pos) %>%
  summarise(cost = sum(cost), sale = sum(sale), .groups = 'drop') %>%
  arrange(pos)
Dummy data:
# Reproducible dummy data: 500 rows, names A-H, integer-valued ids in 1..200.
set.seed(4)
# TRUE rather than T: T is an ordinary variable that can be reassigned.
name <- sample(LETTERS[1:8], 500, replace = TRUE)
id <- round(runif(500, min = 1, max = 200))
df <- data.frame(name, id)
I want to compute, for every other name, how many of its rows use an id that also occurs for B, expressed as a percentage of B's unique ids.
The expected output will be something like this:
name count pct_common
<chr> <int> <dbl>
1 A 17 29.3
2 C 18 31.0
3 D 16 27.6
4 E 22 37.9
5 F 14 24.1
6 G 16 27.6
7 H 20 34.5
My approach so far:
the_name <- 'B'
# Unique (name, id) combinations for the reference name.
# Assigned with `<-` at the start instead of a trailing `->`, so the target
# is visible before the pipeline is read.
list_id <- df %>%
  filter(name %in% the_name) %>%
  distinct(name, id)
# Rows of the other names whose id also occurs for the reference name,
# counted per name and expressed relative to the number of reference ids.
df %>%
  filter(id %in% list_id$id) %>%
  filter(!name %in% the_name) %>%
  group_by(name) %>%
  summarise(count = n()) %>%
  mutate(pct_common = count / nrow(list_id) * 100)
It is getting the job done but creating a separate data frame like this doesn't seem very elegant. Also, it is taking more time for a comparatively large data frame (Millions of observations).
Is there a better way to approach this problem?
Here is another option -
library(dplyr)
# Carry the number of distinct reference ids along as a helper column, keep
# only rows of other names that share an id with the reference name, count
# them per name, and drop the helper once the percentage is computed.
df %>%
  mutate(temp = n_distinct(id[name %in% the_name])) %>%
  filter(id %in% unique(id[name %in% the_name]), !name %in% the_name) %>%
  group_by(name, temp) %>%
  summarise(count = n(), .groups = 'drop') %>%
  mutate(pct_common = count / temp * 100, temp = NULL)
# name count pct_common
# <chr> <int> <dbl>
#1 A 17 29.3
#2 C 18 31.0
#3 D 16 27.6
#4 E 22 37.9
#5 F 14 24.1
#6 G 16 27.6
#7 H 20 34.5
Not better, but here is another approach that I thought of; maybe it will help you:
# Flag every row whose id also occurs for "B" (B = 1) and attach the number of
# distinct "B" ids (N); then count flagged rows per name.
# The result still contains a row for "B" itself (see the output below).
df %>%
  left_join(
    df %>%
      filter(name == "B") %>%
      mutate(B = 1, N = n_distinct(id)) %>%
      select(-name) %>%
      distinct(),
    # Explicit key: `id` is the only shared column; naming it avoids the
    # "Joining, by = ..." message and accidental joins on future columns.
    by = "id"
  ) %>%
  group_by(name) %>%
  summarise(
    count = sum(B, na.rm = TRUE),
    N = mean(N, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(pct_common = 100 * count / N)
name count N pct_common
<chr> <dbl> <dbl> <dbl>
1 A 17 58 29.3
2 B 65 58 112.
3 C 18 58 31.0
4 D 16 58 27.6
5 E 22 58 37.9
6 F 14 58 24.1
7 G 16 58 27.6
8 H 20 58 34.5
We could do this in a single pipe. Create a logical vector ('i1') as column, get the number of distinct elements of 'id' based on 'i1' as 'n1', then do the filter in a single step, count and get the percentage by dividing the 'count' with 'n1'
library(dplyr)
# i1 marks the reference rows, n1 is the number of distinct reference ids.
df %>%
  mutate(i1 = name %in% the_name) %>%
  mutate(n1 = n_distinct(id[i1])) %>%
  filter(id %in% id[i1] & !i1) %>%
  count(name, n1, name = 'count') %>%
  mutate(pct_common = count / n1 * 100) %>%
  select(-n1)
-output
name count pct_common
1 A 17 29.31034
2 C 18 31.03448
3 D 16 27.58621
4 E 22 37.93103
5 F 14 24.13793
6 G 16 27.58621
7 H 20 34.48276
NOTE: The OP wrote: "It is getting the job done but creating a separate data frame like this doesn't seem very elegant. Also, it is taking more time for a comparatively large data frame." The code above does the job in 5 steps and avoids repeating the same calculation (i.e. name %in% the_name) multiple times.
If the data is really big, we can use collapse:
library(collapse)
# n1: number of distinct ids belonging to the reference name
# (presumably what fndistinct() returns for a vector -- TODO confirm).
n1 <- fndistinct(df$id[df$name %in% the_name])
# Keep rows of the other names whose id also occurs for the reference name.
# NOTE(review): `id` and `name` are not prefixed with df$ here; this may rely
# on the global vectors `id`/`name` created with the dummy data -- confirm.
ss(df, id %in% id[name %in% the_name] & !name %in% the_name) %>%
# Per-name counts; presumably fnobs() counts non-missing values per group,
# which leaves the count in the `name` column used below -- TODO confirm.
fnobs(g = .$name, drop = FALSE) %>%
# Percentage relative to n1, computed from the count held in `name`.
tfm(pct_common = 100 *name/n1) %>%
# Rename the count column and drop the `id` column.
frename(name = count) %>%
tfm(id = NULL)
count pct_common
A 17 29.31034
C 18 31.03448
D 16 27.58621
E 22 37.93103
F 14 24.13793
G 16 27.58621
H 20 34.48276
This question already has answers here:
Transpose / reshape dataframe without "timevar" from long to wide format
(9 answers)
Closed 1 year ago.
How can I use tidyr's pivot_wider to convert this data frame from long form to wide form? I tried applying the examples on the docs page, but I must be missing something.
Data Frame
# Three ids with two files (suffix a/b) and two values each.
id <- rep(c(1, 2, 3), each = 2)
filename <- paste0('file', rep(1:3, each = 2), c('a', 'b'), '.txt')
val <- c(832, 834, 221, 878, 2, 19)
df1 <- data.frame(id, filename, val)
view(df1)
id
filename
val
1
file1a.txt
832
1
file1b.txt
834
2
file2a.txt
221
2
file2b.txt
878
3
file3a.txt
2
3
file3b.txt
19
Desired Output
id
filename1
filename2
val1
val2
1
file1a.txt
file1b.txt
832
834
2
file2a.txt
file2b.txt
221
878
3
file3a.txt
file3b.txt
2
19
Failed Attempts
# Failed attempt 1: only values_from is given, so nothing tells the two rows
# of each id apart. Per the result shown below, each id's values end up
# packed together in a single cell per column.
df_wide <- pivot_wider(data = df1,
id_cols = id,
values_from = c("filename", "val"))
view(df_wide)
id
filename_
val_
1
1:2
c(832,834)
2
3:4
c(221,878)
3
5:6
c(2,19)
# Failed attempt 2: the data columns themselves are used as names_from.
# Per the result shown below, every distinct filename/val pair becomes its
# own column, filled with NA for the ids it does not belong to.
df_wide <- pivot_wider(data = df1,
id_cols = id,
names_from = c("filename", "val"),
values_from = c("filename", "val"))
view(df_wide)
id
filename_file1a.txt_832
filename_file1b.txt_834
filename_file2a.txt_221
...etc
1
file1a.txt
file1b.txt
NA
...etc
2
NA
NA
file2a.txt
...etc
3
NA
NA
NA
...etc
We need a row sequence
library(dplyr)
library(tidyr)
library(data.table)
# rowid(id) numbers the rows within each id (1, 2, ...); that sequence is
# what pivot_wider() needs to build filename1/filename2 and val1/val2.
df1 %>%
  mutate(cn = rowid(id)) %>%
  pivot_wider(
    names_from = cn,
    values_from = c(filename, val),
    names_sep = ""
  )
-output
# A tibble: 3 x 5
# id filename1 filename2 val1 val2
# <dbl> <chr> <chr> <dbl> <dbl>
#1 1 file1a.txt file1b.txt 832 834
#2 2 file2a.txt file2b.txt 221 878
#3 3 file3a.txt file3b.txt 2 19
Or do a group by row_number
# Same idea with dplyr only: number the rows within each id.
df1 %>%
  group_by(id) %>%   # the pipe after group_by() was missing, which broke the chain
  mutate(cn = row_number()) %>%
  ungroup() %>%      # grouping is only needed for the numbering
  pivot_wider(names_from = cn, values_from = c(filename, val), names_sep = "")
If we need not use %>%, specify the data as the mutated original dataset, with an added column 'cn' based on the sequence of 'id'
# Pipe-free form: the mutated data frame is passed directly as `data`.
pivot_wider(
  data = mutate(df1, cn = rowid(id)),
  names_from = cn,
  values_from = c(filename, val),
  names_sep = ""
)
A data.table option with dcast
> # One row per id; rowid(id) numbers the rows within each id and becomes the
> # column suffix (filename_1, filename_2, val_1, val_2 in the result below).
> # NOTE(review): setDT() presumably converts df1 in place -- confirm.
> dcast(setDT(df1), id ~ rowid(id), value.var = c("filename", "val"))
id filename_1 filename_2 val_1 val_2
1: 1 file1a.txt file1b.txt 832 834
2: 2 file2a.txt file2b.txt 221 878
3: 3 file3a.txt file3b.txt 2 19
How can I use a for loop to sum data by group, then break and print the accumulated sum for A and B respectively?
ie:
Type value
A 2
A NA
A 13 15
B 565
B 245
B 578 1388
library(dplyr)
# Asker's attempt: running total of value within each Type.
# NOTE(review): `True` is not R's logical constant (TRUE), and cumsum() is
# presumably called with one argument only -- the result shown below looks
# like plain cumsum(value), where an NA turns all later totals into NA.
df %>%
group_by(Type) %>%
mutate(cs = cumsum(value, na.rm = True))
But it only shows the whole table, and the sum for A, which should be 15, ends up as NA:
Type value cs
A 2 2
A NA NA
A 13 NA
B 565 565
B 245 810
B 578 1388
Using dplyr you can try
library(dplyr)
df %>%
  group_by(Type) %>%
  # Group total on the last row of each Type, NA on every other row.
  # Done in one step: no redundant last() around the scalar sum, and no
  # temporary `id` column (which would overwrite and then drop an existing one).
  mutate(cs = replace(rep(sum(value, na.rm = TRUE), n()), row_number() != n(), NA))
#Output
# A tibble: 6 x 3
# Groups: Type [2]
Type value cs
<chr> <int> <int>
1 A 2 NA
2 A NA NA
3 A 13 15
4 B 565 NA
5 B 245 NA
6 B 578 1388
If I understand correctly, the OP expects that all rows of the new column cs are blank except for the last row of each group where the sum of the values belonging to the group should be printed.
A blank row is only possible if the new column cs is of type character. If cs is expected to be numeric, there is no choice other than to print 0, NA, or some other numeric value, but not "" (the empty string).
So, below there are suggestions to create a character column either by using
ifelse(), or
replace() and rep(), or
c() and rep().
in data.table and dplyr syntax, resp.
Note that no for loop is required at all.
data.table
library(data.table)
# Last row of each Type gets the group total as text; all other rows get "".
setDT(df)[
  , cs := fifelse(seq_len(.N) == .N, as.character(sum(value, na.rm = TRUE)), ""),
  by = Type
][]
or
# Start from .N empty strings and overwrite the last one with the group total.
setDT(df)[, cs := replace(character(.N), .N, sum(value, na.rm = TRUE)), by = Type][]
or
# .N - 1 empty strings followed by the group total (coerced to character by c()).
setDT(df)[, cs := c(character(.N - 1L), sum(value, na.rm = TRUE)), by = Type][]
Type value cs
1: A 2
2: A NA
3: A 13 15
4: B 565
5: B 245
6: B 578 1388
dplyr
library(dplyr)
df %>%
  group_by(Type) %>%
  # Last row of each Type gets the group total, every other row "".
  # The closing parenthesis of the test was misplaced (a syntax error).
  # as.character() keeps cs character in every group, including groups with
  # a single row, where ifelse() would otherwise return a number.
  mutate(cs = ifelse(row_number() == n(), as.character(sum(value, na.rm = TRUE)), ""))
or
# Start from n() empty strings and overwrite the last one with the group total.
df %>%
  group_by(Type) %>%
  mutate(cs = replace(character(n()), n(), sum(value, na.rm = TRUE)))
or
# n() - 1 empty strings followed by the group total (coerced to character by c()).
df %>%
  group_by(Type) %>%
  mutate(cs = c(character(n() - 1L), sum(value, na.rm = TRUE)))
# A tibble: 6 x 3
# Groups: Type [2]
Type value cs
<chr> <int> <chr>
1 A 2 ""
2 A NA ""
3 A 13 "15"
4 B 565 ""
5 B 245 ""
6 B 578 "1388"
I am using R and have the next table: (example)
ID Euros N Euros N Euros N
1 A 133.911,20 451 134.208,78 450 442,03 328
2 C 9.470,35 2856 26,18 2721 26,28 2699
My desired behaviour is to have Euros in one row and N in another row, instead of in columns:
ID Var1 Var2 Var3 Var4
1 A Euros 133.911,20 134.208,78 442,03
2 A N 451 450 328
3 C Euros 9.470,35 26,18 26,28
4 C N 2856 2721 2699
I have tried to do so only with A group and using the following code:
# Asker's attempt: Euros values become the column names and N the cell values
# (see the result below). NOTE(review): spread() is superseded by pivot_wider().
mydatatable_wide <- spread(mydatatable, Euros, N)
But I don't get my expected result. What I get is:
ID 133.911,20 134.208,78 442,03
1 A 451 450 328
Need some work to achieve what you want - I am using dplyr & tidyr
library(dplyr)
library(tidyr)
# The table from the question, rebuilt as a tribble.
# Note: the numbers are written with "." as the decimal point and without
# thousands separators, because R numeric literals do not use them.
df <- tribble(
~ID, ~Euros, ~N, ~Euros, ~N, ~Euros, ~N,
"A", 133911.20, 451, 134208.78, 450, 442.03, 328,
"C", 9470.35, 2856, 26.18, 2721, 26.28, 2699)
df %>%
# First reshape to long format: several rows per ID, with the column name
# (Euros or N) in `var` and the number in `value`.
pivot_longer(cols = where(is.numeric), names_to = "var", values_to = "value") %>%
# Then process the Euros rows and the N rows separately via group_by() + group_map().
group_by(var) %>%
group_map(~ {
.x %>%
group_by(ID) %>%
# Within each ID, number the values: var_1, var_2, ...
mutate(index_name = paste0("var_", seq(1, n(), by =1))) %>%
# Spread the numbered values into columns: one row per ID and var (Euros/N).
pivot_wider(names_from = "index_name", values_from = value, values_fill = NA)
}, .keep = TRUE) %>%
# Finally combine the data frames returned by group_map() into one.
bind_rows()
Output
ID var var_1 var_2 var_3
<chr> <chr> <dbl> <dbl> <dbl>
1 A Euros 133911. 134209. 442.
2 C Euros 9470. 26.2 26.3
3 A N 451 450 328
4 C N 2856 2721 2699