r add a new column with conditions from another - r

I have the following table
Type Score
B 18
A 23
A 45
B 877
A 654
B 345
A 23445
A 45
A 432
B 22
B 4566
B 2
B 346
A 889
I would like to create a new column that copies Score but sets the value to 0 for the Type A rows, see below
Type Score New_Score
B 18 18
A 23 0
A 45 0
B 877 877
A 654 0
B 345 345
A 23445 0
A 45 0
A 432 0
B 22 22
B 4566 4566
B 2 2
B 346 346
A 889 0
I have tried a good few things in r but none of them work for me, any help would be most appreciated.

use this
# Zero out the Score for Type "A" rows; Type "B" rows keep their Score.
df$New_Score <- replace(df$Score, df$Type == "A", 0)
Check
# Rebuild the example table from the question.
df <- read.table(text = "Type Score
B 18
A 23
A 45
B 877
A 654
B 345
A 23445
A 45
A 432
B 22
B 4566
B 2
B 346
A 889", header = TRUE)
# Zero out the Score for Type "A" rows; Type "B" rows keep their Score.
df$New_Score <- replace(df$Score, df$Type == "A", 0)
df
Type Score New_Score
1 B 18 18
2 A 23 0
3 A 45 0
4 B 877 877
5 A 654 0
6 B 345 345
7 A 23445 0
8 A 45 0
9 A 432 0
10 B 22 22
11 B 4566 4566
12 B 2 2
13 B 346 346
14 A 889 0

Using ifelse.
transform(dat, new_score=ifelse(Type == "B", Score, 0))
# Type Score new_score
# 1 B 18 18
# 2 A 23 0
# 3 A 45 0
# 4 B 877 877
# 5 A 654 0
# 6 B 345 345
# 7 A 23445 0
# 8 A 45 0
# 9 A 432 0
# 10 B 22 22
# 11 B 4566 4566
# 12 B 2 2
# 13 B 346 346
# 14 A 889 0

use of dplyr::mutate and case_when should solve the problem, I would think.
library(dplyr)
df <- data.frame(
  Type = c("B", "A", "C", "D", "A", "B", "A"),
  Score = c(1, 2, 3, 4, 5, 6, 7)
)
# Refer to columns by bare name inside mutate(): `df$Type` would bypass
# dplyr's data mask and always read the full, ungrouped data frame.
df_new <- df %>%
  mutate(
    New_Score = case_when(
      Type == "A" ~ 0,  # zero out the Type "A" rows
      TRUE ~ Score      # every other Type keeps its Score
    )
  )
df_new

Just for fun. Here is another solution
# Start from a copy of Score, then overwrite the Type "A" rows with 0.
# Subset-assignment does this directly, so no helper column is needed.
df$New_Score <- df$Score
df$New_Score[df$Type == "A"] <- 0
Output:
Type Score New_Score
1 B 18 18
2 A 23 0
3 A 45 0
4 B 877 877
5 A 654 0
6 B 345 345
7 A 23445 0
8 A 45 0
9 A 432 0
10 B 22 22
11 B 4566 4566
12 B 2 2
13 B 346 346
14 A 889 0
data:
structure(list(Type = c("B", "A", "A", "B", "A", "B", "A", "A",
"A", "B", "B", "B", "B", "A"), Score = c(18, 23, 45, 877, 654,
345, 23445, 45, 432, 22, 4566, 2, 346, 889), New_Score = c(18,
0, 0, 877, 0, 345, 0, 0, 0, 22, 4566, 2, 346, 0)), row.names = c(NA,
-14L), class = c("tbl_df", "tbl", "data.frame"))

We can use
dat$new_score <- ifelse(dat$Type == "B", dat$Score, 0)

Related

How to convert data into transactional format in R

I have the following dataset and I want to transform it into a transactional format.
sample_data<-data.frame(id=c(452,125,288,496,785,328,712,647),a=c(5,8,7,9,0,0,4,0),b=c(0,7,8,9,3,6,0,0),c=c(7,8,9,0,0,0,0,7),d=c(8,7,5,0,0,0,0,7))
sample_data
sample_data
id a b c d
452 5 0 7 8
125 8 7 8 7
288 7 8 9 5
496 9 9 0 0
785 0 3 0 0
328 0 6 0 0
712 4 0 0 0
647 0 0 7 7
The desired output is as follows:
id item
452 a c d
125 a b c d
288 a b c d
496 a b
785 b
328 b
712 a
647 c d
How can I achieve this in R?
Is there an easier way of doing this?
Here is a tidyverse solution using pivot_longer, filter, and summarize.
library(dplyr)
library(stringr)
library(tidyr)
sample_data %>%
pivot_longer(a:d, names_to = "item") %>%
filter(value != 0) %>%
group_by(id) %>%
summarize(item = str_c(item, collapse = " "))
# A tibble: 8 x 2
id item
<dbl> <chr>
1 125 a b c d
2 288 a b c d
3 328 b
4 452 a c d
5 496 a b
6 647 c d
7 712 a
8 785 b
We can use apply to loop over the rows, get the names of the data where the value of numeric columns are not 0, and paste them together, then cbind with the first column of the data
cbind(sample_data[1], item = apply(sample_data[-1], 1,
function(x) paste(names(x)[x != 0], collapse = ' ')))
-output
# id item
#1 452 a c d
#2 125 a b c d
#3 288 a b c d
#4 496 a b
#5 785 b
#6 328 b
#7 712 a
#8 647 c d

Reordering a pivot wide table

I have the following dataframe:
df <- structure(list(rows = c(1, 2, 3, 4, 5, 6), col1 = c(122, 111,
111, 222, 212, 122), col2 = c(10101, 20202, 200022, 10201, 20022,
22222), col3 = c(11, 22, 22, 22, 11, 22)), class = "data.frame", row.names = c(NA,
-6L))
rows col1 col2 col3
1 1 122 10101 11
2 2 111 20202 22
3 3 111 200022 22
4 4 222 10201 22
5 5 212 20022 11
6 6 122 22222 22
I would like to filter the rows where at least one of the columns 2,3,4 include "1" AND "2".
The desired outcome would be:
rows col1 col2 col3
1 1 122 10101 11
4 4 222 10201 22
5 5 212 20022 11
6 6 122 22222 22
The following two are not working because they scan all the three columns together and not one by one.
df[which(apply(df[,2:4],1,function(x) any(grepl("1",x)) & any(grepl("2",x)))),]
OR
library(tidyverse)
TRIPS2_fin %>% filter_at(vars(2,3,4), any_vars(str_detect(., pattern="1|2")))
You could use :
df[apply(df[2:4], 1, function(x) any(grepl('1.*2|2.*1', x))),]
# rows col1 col2 col3
#1 1 122 10101 11
#4 4 222 10201 22
#5 5 212 20022 11
#6 6 122 22222 22
And similar using filter_at
library(dplyr)
df %>% filter_at(2:4, any_vars(grepl('1.*2|2.*1', .)))
We can vectorize it in base R
df[Reduce(`|`, lapply(df[2:4], grepl, pattern = '1.*2|2.*1')),]
# rows col1 col2 col3
#1 1 122 10101 11
#4 4 222 10201 22
#5 5 212 20022 11
#6 6 122 22222 22

Subtract value to the nearest specific string in another column

Let's say I have a data frame df as below. In total, there are 20 rows and there are four types of strings in column string: "A", "B", "C" and "D".
no string position
1 B 650
2 C 651
3 B 659
4 C 660
5 C 662
6 B 663
7 D 668
8 D 670
9 C 671
10 B 672
11 C 673
12 A 681
13 C 682
14 B 683
15 C 684
16 D 690
17 A 692
18 C 693
19 D 694
20 C 695
By subtracting the value in column position of the previous row from that of the current row, I can get a fourth column, distance, by executing the following command:
df$distance <- ave(df$position, FUN=function(x) c(0, diff(x)))
So that I could get distance from the current value to the previous row as below:
no string position distance
1 B 650 0
2 C 651 1
3 B 659 8
4 C 660 1
5 C 662 2
6 B 663 1
7 D 668 5
8 D 670 2
9 C 671 1
10 B 672 1
11 C 673 1
12 A 681 8
13 C 682 1
14 B 683 1
15 C 684 1
16 D 690 6
17 A 692 2
18 C 693 1
19 D 694 1
20 C 695 1
However, what I wish to have is the distance in column position from each row to the nearest previous row whose string is "C"; note the changed values in rows 7, 8 and 17 below:
no string position distance
1 B 650 0
2 C 651 1
3 B 659 8
4 C 660 1
5 C 662 2
6 B 663 1
7 D 668 6
8 D 670 8
9 C 671 1
10 B 672 1
11 C 673 1
12 A 681 8
13 C 682 1
14 B 683 1
15 C 684 1
16 D 690 6
17 A 692 8
18 C 693 1
19 D 694 1
20 C 695 1
How can I do so? Also, how can I get the distance to the nearest next "C" in column string as well?
Maybe not an ideal solution and there is a way to simplify this.
# Taken from your code: distance of each row to the previous row
df$distance <- ave(df$position, FUN = function(x) c(0, diff(x)))
# Logical values indicating occurrence of "C"
c_occur <- df$string == "C"
# Each run starts at a "C" row and continues until the next "C".
c_group <- cumsum(c_occur)
# Number the rows within each run. ave() is given an integer vector on
# purpose: passing df$string would return the counters as character, so
# `> 2` would become a text comparison ("10" > "2" is FALSE).
pos_in_group <- ave(seq_along(c_occur), c_group, FUN = seq_along)
# We can ignore the first two rows of each run: the first is the "C" itself
# and the second is already correct from the row-to-row difference.
# Rows before the first "C" (c_group == 0) have no previous "C" and keep
# their row-to-row distance.
inds_to_replace <- which(pos_in_group > 2 & c_group > 0)
# Get the closest previous occurrence of "C" for each index to replace.
# findInterval() counts the "C" rows at or before each index; the indices
# to replace are never "C" rows themselves, so this is the last earlier "C".
c_inds <- which(c_occur)
c_to_replace <- c_inds[findInterval(inds_to_replace, c_inds)]
# To get the distance from the nearest next "C" instead, use
# c_inds[findInterval(inds_to_replace, c_inds) + 1]
# (this is NA for rows with no later "C").
# Replace the values
df$distance[inds_to_replace] <- df$position[inds_to_replace] -
  df$position[c_to_replace]
df[inds_to_replace, ]
# no string position distance
#7 7 D 668 6
#8 8 D 670 8
#17 17 A 692 8
The following tidyverse approach reproduces your expected output.
Problem description: Calculate the difference in position of the current row with the previous string = "C" row; if there is no previous string = "C" row or the row itself has string = "C", then the distance is given by the difference in position between the current and previous row (irrespective of string).
library(tidyverse)
df %>%
mutate(nC = cumsum(string == "C")) %>%
group_by(nC) %>%
mutate(dist = cumsum(c(0, diff(position)))) %>%
ungroup() %>%
mutate(dist = if_else(dist == 0, c(0, diff(position)), dist)) %>%
select(-nC)
## A tibble: 20 x 4
# no string position dist
# <int> <fct> <int> <dbl>
# 1 1 B 650 0.
# 2 2 C 651 1.
# 3 3 B 659 8.
# 4 4 C 660 1.
# 5 5 C 662 2.
# 6 6 B 663 1.
# 7 7 D 668 6.
# 8 8 D 670 8.
# 9 9 C 671 1.
#10 10 B 672 1.
#11 11 C 673 1.
#12 12 A 681 8.
#13 13 C 682 1.
#14 14 B 683 1.
#15 15 C 684 1.
#16 16 D 690 6.
#17 17 A 692 8.
#18 18 C 693 1.
#19 19 D 694 1.
#20 20 C 695 1.
Sample data
df <- read.table(text =
"no string position
1 B 650
2 C 651
3 B 659
4 C 660
5 C 662
6 B 663
7 D 668
8 D 670
9 C 671
10 B 672
11 C 673
12 A 681
13 C 682
14 B 683
15 C 684
16 D 690
17 A 692
18 C 693
19 D 694
20 C 695", header = T)
Here is a data.table way:
dtt[, distance := c(0, diff(position))]
dtt[cumsum(string == 'C') > 0,
distance := ifelse(seq_len(.N) == 1, distance, position - position[1]),
by = cumsum(string == 'C')]
# no string position distance
# 1: 1 B 650 0
# 2: 2 C 651 1
# 3: 3 B 659 8
# 4: 4 C 660 1
# 5: 5 C 662 2
# 6: 6 B 663 1
# 7: 7 D 668 6
# 8: 8 D 670 8
# 9: 9 C 671 1
# 10: 10 B 672 1
# 11: 11 C 673 1
# 12: 12 A 681 8
# 13: 13 C 682 1
# 14: 14 B 683 1
# 15: 15 C 684 1
# 16: 16 D 690 6
# 17: 17 A 692 8
# 18: 18 C 693 1
# 19: 19 D 694 1
# 20: 20 C 695 1
Here is dtt:
structure(list(no = 1:20, string = c("B", "C", "B", "C", "C",
"B", "D", "D", "C", "B", "C", "A", "C", "B", "C", "D", "A", "C",
"D", "C"), position = c(650L, 651L, 659L, 660L, 662L, 663L, 668L,
670L, 671L, 672L, 673L, 681L, 682L, 683L, 684L, 690L, 692L, 693L,
694L, 695L)), row.names = c(NA, -20L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x1939260>)
If you want to get distance to nearest next C for non-C rows, try this:
dtt[, distance := c(0, diff(position))]
dtt[, g := rev(cumsum(rev(string == 'C')))]
dtt[g > 0, distance := ifelse(seq_len(.N) == .N, distance, abs(position - position[.N])), by = g]
dtt[, g := NULL]
# no string position distance
# 1: 1 B 650 1
# 2: 2 C 651 1
# 3: 3 B 659 1
# 4: 4 C 660 1
# 5: 5 C 662 2
# 6: 6 B 663 8
# 7: 7 D 668 3
# 8: 8 D 670 1
# 9: 9 C 671 1
# 10: 10 B 672 1
# 11: 11 C 673 1
# 12: 12 A 681 1
# 13: 13 C 682 1
# 14: 14 B 683 1
# 15: 15 C 684 1
# 16: 16 D 690 3
# 17: 17 A 692 1
# 18: 18 C 693 1
# 19: 19 D 694 1
# 20: 20 C 695 1

Creating a total row based on the values of another column

Let's consider the following example:
set.seed(5)
df <- data.frame(CATEGORY = rep(c("A", "B", "C", "D"), each = 2),
SUBCATEGORY = paste0(rep(c("A", "B", "C", "D"), each = 2), 1:2),
COUNT = sample(1:1000, size = 8, replace = TRUE),
SUBCOUNT = sample(1:200, size = 8, replace = TRUE),
stringsAsFactors = FALSE)
df$SUBCOUNT_PCT <- paste0(formatC(df$SUBCOUNT/df$COUNT * 100, digits = 2, format = 'f'), "%")
> df
CATEGORY SUBCATEGORY COUNT SUBCOUNT SUBCOUNT_PCT
1 A A1 201 192 95.52%
2 A A2 686 23 3.35%
3 B B1 917 55 6.00%
4 B B2 285 99 34.74%
5 C C1 105 64 60.95%
6 C C2 702 112 15.95%
7 D D1 528 53 10.04%
8 D D2 808 41 5.07%
I would like to create rows for CATEGORY which aggregate COUNT and SUBCOUNT as follows:
CATEGORY SUBCATEGORY COUNT SUBCOUNT SUBCOUNT_PCT
1 A TOTAL 887 215 24.24%
2 A A1 201 192 95.52%
3 A A2 686 23 3.35%
4 B TOTAL 1202 154 12.81%
5 B B1 917 55 6.00%
6 B B2 285 99 34.74%
7 C TOTAL 807 176 21.81%
8 C C1 105 64 60.95%
9 C C2 702 112 15.95%
10 D TOTAL 1336 94 7.04%
11 D D1 528 53 10.04%
12 D D2 808 41 5.07%
Is there a way to do this without having to loop through every CATEGORY?
Using dplyr to summarize data and then bind back to original data
library(dplyr)
df %>%
group_by(CATEGORY) %>%
summarize(SUBCATEGORY = "TOTAL",
COUNT = sum(COUNT),
SUBCOUNT = sum(SUBCOUNT),
SUBCOUNT_PCT = sprintf("%.2f%%", SUBCOUNT / COUNT * 100)) %>%
bind_rows(., df) %>%
arrange(CATEGORY)
# A tibble: 12 x 5
CATEGORY SUBCATEGORY COUNT SUBCOUNT SUBCOUNT_PCT
<chr> <chr> <int> <int> <chr>
1 A TOTAL 887 215 24.24%
2 A A1 201 192 95.52%
3 A A2 686 23 3.35%
4 B TOTAL 1202 154 12.81%
5 B B1 917 55 6.00%
6 B B2 285 99 34.74%
7 C TOTAL 807 176 21.81%
8 C C1 105 64 60.95%
9 C C2 702 112 15.95%
10 D TOTAL 1336 94 7.04%
11 D D1 528 53 10.04%
12 D D2 808 41 5.07%

Aggregate function to group,count and mean

I have a dataset with three variables: a, b and c.
a 45 345
a 45 345
a 34 234
a 35 456
b 45 123
b 65 345
b 34 456
c 23 455
c 54 567
c 34 345
c 87 567
c 67 345
I want to aggregate the dataset by a and b and get the count and the mean. Please find the desired output below. Is there any function to do both together?
A B numobs c
a 34 1 234
a 35 1 456
a 45 2 345
b 34 1 456
b 45 1 123
b 65 1 345
c 23 1 455
c 34 1 345
c 54 1 567
c 67 1 345
c 87 1 567
numobs is the count and c is the mean value
We can use dplyr
library(dplyr)
# summarise() collapses each (A, B) group to a single row holding the count
# and the mean; mutate() would keep every original row and only repeat the
# group values on each of them.
df1 %>%
  group_by(A, B) %>%
  summarise(numobs = n(), C = mean(C), .groups = "drop")
Or with data.table
library(data.table)
# Aggregate in `j` to get one row per (A, B) group; `:=` would only add the
# columns to the existing rows. keyby also sorts the result by A, B.
setDT(df1)[, .(numobs = .N, C = mean(C)), keyby = .(A, B)]

Resources