add recursive number with condition in dataframe in R

I have a problem with adding some records to a table under a particular condition.
For example, I have this kind of table:
  id word count
1  1   aa     2
2  2   bb     3
Then I want to change the id column and add rows with new numbers, keeping the same data in the other columns, like this:
   id word count
1 100   aa     2
2 101   aa     2
3 102   aa     2
4 103   aa     2
5 200   bb     3
6 201   bb     3
7 202   bb     3
8 203   bb     3
The id column needs two digits appended behind it, with consecutive numbers counting up from there, without changing the data in the other columns. Supposing I have thousands of records, I wonder how to make this happen.

It is not entirely clear from the description. Based on the expected output, an option is to create a list column by looping over the 'id', getting a sequence of length 4 after multiplying by 100, and then unnest the list column
library(dplyr)
library(purrr)
library(tidyr)
df1 %>%
  mutate(id = map(id * 100, seq, length.out = 4)) %>%
  unnest(c(id))
# A tibble: 8 x 3
# id word count
# <dbl> <chr> <int>
#1 100 aa 2
#2 101 aa 2
#3 102 aa 2
#4 103 aa 2
#5 200 bb 3
#6 201 bb 3
#7 202 bb 3
#8 203 bb 3
Or another option is to replicate the rows (uncount), grouped by 'word', modify the 'id'
df1 %>%
  uncount(4) %>%
  group_by(word) %>%
  mutate(id = seq(100 * first(id), length.out = n()))
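If a different number of new rows per original id is needed, only the 4 in length.out (and in uncount(4)) has to change. A minimal sketch with a hypothetical count (note that two appended digits leave room for at most 100 rows per id before the next id's block is reached):
n_new <- 10   # hypothetical number of rows to generate per original id
df1 %>%
  mutate(id = map(id * 100, seq, length.out = n_new)) %>%
  unnest(c(id))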
data
df1 <- structure(list(id = 1:2, word = c("aa", "bb"), count = 2:3),
                 class = "data.frame", row.names = c("1", "2"))

Try the following base R function.
It loops (lapply) over column 'id' creating a vector like the one in the question and then putting the other columns in order in a data.frame, then combines (rbind) all these df's into the return value.
fun <- function(x, n = 3){
  # all columns other than 'id' get copied over unchanged
  cols <- grep('id', names(x), invert = TRUE)
  out <- lapply(x[['id']], function(i){
    # build the new ids by appending two digits (00, 01, ..., n) to i
    y <- sprintf(paste0(i, "%02d"), c(0L, seq.int(n)))
    y <- data.frame(id = y)
    # copy the remaining columns from row i (uses the id value as the row
    # index, which works here because the ids are 1, 2, ...)
    for(j in cols) y[[j]] <- x[i, j]
    y
  })
  out <- do.call(rbind, out)
  row.names(out) <- NULL
  out
}
fun(df1)
# id V2 V3
#1 100 aa 2
#2 101 aa 2
#3 102 aa 2
#4 103 aa 2
#5 200 bb 3
#6 201 bb 3
#7 202 bb 3
#8 203 bb 3
Data
df1 <- read.table(text = "
id word count
1 1 aa 2
2 2 bb 3
", header = TRUE)

Related

count the number of columns for each row by condition on character and missing

I want to count the number of columns for each row, with a condition on character and missing values.
For example, I have this dataset, test.
I want to create a num column counting the number of columns that are not missing or empty.
a<-c("aa","bb","cc","dd","",NA)
b<-c("",NA,"aa","","","dd")
c<-c("aa","",NA,NA,"cc","dd")
d<-c("aa","bb","",NA,"cc","dd")
test<-data.frame(cbind(a,b,c,d))
a b c d
1 aa aa aa
2 bb <NA> bb
3 cc aa <NA>
4 dd <NA> <NA>
5 cc cc
6 <NA> dd dd dd
I want to count the number of columns not containing NA or an empty value, like this:
a b c d num
1 aa aa aa 3
2 bb <NA> bb 2
3 cc aa <NA> 2
4 dd <NA> <NA> 1
5 cc cc 2
6 <NA> dd dd dd 3
I tried some approaches from other posts, like rowSums:
Count number of columns by a condition (>) for each row
> test$num<-rowSums(test!=c("",NA),na.rm=T)
> test
a b c d num
1 aa aa aa 3
2 bb <NA> bb 0
3 cc aa <NA> 2
4 dd <NA> <NA> 0
5 cc cc 2
6 <NA> dd dd dd 0
However, it returns wrong numbers, and I couldn't figure out why.
Would you let me know how to solve this problem?
You can use nchar + rowSums
test$num <- rowSums(nchar(as.matrix(test))>1,na.rm = TRUE)
or %in% + rowSums
test$num <- rowSums(`dim<-`(!as.matrix(test) %in% c("",NA),dim(test)))
such that
> test
a b c d num
1 aa aa aa 3
2 bb <NA> bb 2
3 cc aa <NA> 2
4 dd <NA> <NA> 1
5 cc cc 2
6 <NA> dd dd dd 3
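As a side note on why the original attempt returned wrong numbers (my reading, not part of the original answer): in
test != c("", NA)
the length-2 vector c("", NA) gets recycled over the cells, so half of them are compared against NA; those comparisons return NA and are then dropped by na.rm = TRUE, which matches rows 2, 4 and 6 coming out as 0.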
You could use rowSums to count number of NAs or empty values in each row and then subtract it from number of columns in the dataframe.
test$num <- ncol(test) - rowSums(is.na(test) | test == "")
test
# a b c d num
#1 aa aa aa 3
#2 bb <NA> bb 2
#3 cc aa <NA> 2
#4 dd <NA> <NA> 1
#5 cc cc 2
#6 <NA> dd dd dd 3
Another idea using rowSums is to replace empty with NA, i.e.
rowSums(!is.na(replace(test, test == '', NA)))
#[1] 3 2 2 1 2 3
How about this approach from the tidyverse which also tells you how many columns contain NAs or empty strings?
a<-c("aa","bb","cc","dd","",NA)
b<-c("",NA,"aa","","","dd")
c<-c("aa","",NA,NA,"cc","dd")
d<-c("aa","bb","",NA,"cc","dd")
test<-data.frame(cbind(a,b,c,d))
library(magrittr) #import the pipe operator
num_cols <- test %>%
  tibble::rowid_to_column("row_id") %>%   # 1st add a rowid column
  dplyr::group_by(row_id) %>%             # split the data into single-row groups (i.e. row vectors)
  tidyr::nest() %>%                       # turn it into a list column called data
  dplyr::mutate(
    # loop over the data column of row vectors using map_dbl
    num_NAs    = purrr::map_dbl(data, ~sum(is.na(.))),               # count the number of NAs
    num_empty  = purrr::map_dbl(data, ~sum(. == "", na.rm = TRUE)),  # count the empty strings
    # count columns without NAs or missing values (what you asked for)
    num_values = purrr::map_dbl(data, ~length(.) - sum(num_NAs, num_empty))
  ) %>%
  dplyr::ungroup() %>%                    # remove the grouping structure
  dplyr::select(num_NAs, num_empty, num_values)  # extract only the variables you need
test_v2 <- cbind(test, num_cols)
test_v2
a b c d num_NAs num_empty num_values
1 aa aa aa 0 1 3
2 bb <NA> bb 1 1 2
3 cc aa <NA> 1 1 2
4 dd <NA> <NA> 2 1 1
5 cc cc 0 2 2
6 <NA> dd dd dd 1 0 3

looping and condition on string in r

I would like to create a new column based on the conditions below:
if the `str` column only contains `A` then insert `A`
if the `str` column only contains `B` then insert `B`
if the `str` column only contains `A` and `B` then insert `AB`
df<-read.table(text="
ID str
1 A
1 A
1 AA
1 ABB
2 BA
2 BB", header=T)
ID str simplify_str
1 A A
1 A A
1 AA A
1 ABB AB
2 BA AB
2 BB B
As far as tidyverse options are concerned, you could use dplyr::case_when with stringr::str_detect
library(dplyr)
library(stringr)
df %>%
  mutate(simplify_str = case_when(
    str_detect(str, "^A+$") ~ "A",
    str_detect(str, "^B+$") ~ "B",
    TRUE ~ "AB"))
# ID str simplify_str
#1 1 A A
#2 1 A A
#3 1 AA A
#4 1 ABB AB
#5 2 BA AB
#6 2 BB B
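One caveat (my addition, not part of the original answer): the catch-all TRUE ~ "AB" labels every remaining string as "AB", which is fine for the data shown but would also tag a string containing neither A nor B. If such strings can occur, an explicit test is safer:
df %>%
  mutate(simplify_str = case_when(
    str_detect(str, "^A+$") ~ "A",
    str_detect(str, "^B+$") ~ "B",
    str_detect(str, "A") & str_detect(str, "B") ~ "AB"))
# strings that match none of the patterns become NA instead of "AB"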
Using your data.frame:
As <- grep("A",df$str)
Bs <- grep("B",df$str)
df$simplify_str <- ""
df$simplify_str[As] <- paste0(df$simplify_str[As],"A")
df$simplify_str[Bs] <- paste0(df$simplify_str[Bs],"B")
df
ID str simplify_str
1 1 A A
2 1 A A
3 1 AA A
4 1 ABB AB
5 2 BA AB
6 2 BB B
A general solution in base R that splits the string and pastes together the unique characters in sorted order.
df$simplify_str <- sapply(strsplit(as.character(df$str), ""),
function(x) paste(unique(sort(x)), collapse = ""))
df
# ID str simplify_str
#1 1 A A
#2 1 A A
#3 1 AA A
#4 1 ABB AB
#5 2 BA AB
#6 2 BB B

assign groups depending on pairwise combinations within string

I have a big df with the following structure
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7), name = c("aa", "ab", "ac", "aa", "aab", "aac", "aabc")), .Names = c("id", "name"), row.names = c(NA, -7L), class = "data.frame")
df
id name
1 1 aa
2 2 ab
3 3 ac
4 4 aa
5 5 aab
6 6 aac
7 7 aabc
I would like to create a new column group depending on the two-character strings in column name (here aa, ab, ac), to achieve something like
df
id name group
1 1 aa 1
2 2 ab 2
3 3 ac 3
4 4 aa 1
5 5 aab 1
5 5 aab 2
6 6 aac 1
6 6 aac 3
7 7 aabc 1
7 7 aabc 2
7 7 aabc 3
While assigning groups for the two-character strings is straightforward, I struggle to find an efficient way to include the pairwise combinations of longer strings. I thought about splitting each string with nchar > 2 into all the possible pairwise combinations and assigning them to the respective groups, but wonder if there is a better way.
Further notes
only pairwise combinations found in df (not all possible combinations)
order of the two character string does not matter (e.g. ab=ba)
only unique recombinations of longer strings (e.g. aaab is just aa and ab)
Similar question without the recombination problem: Assigning groups using grepl with multiple inputs
How about the following
# Your data
df <- structure(
list(
id = c(1, 2, 3, 4, 5, 6, 7),
name = c("aa", "ab", "ac", "aa", "aab", "aac", "aabc")),
.Names = c("id", "name"), row.names = c(NA, -7L), class = "data.frame")
# Create all possible 2char combinations from unique chars in string
group <- lapply(strsplit(df$name, ""), function(x)
unique(apply(combn(x, 2), 2, function(y) paste0(y, collapse = ""))));
# Melt and add original data
require(reshape2);
df2 <- melt(group);
df2 <- cbind.data.frame(
df2,
df[match(df2$L1, df$id), ]);
df2$group <- as.numeric(as.factor(df2$value));
df2;
# value L1 id name group
#1 aa 1 1 aa 1
#2 ab 2 2 ab 2
#3 ac 3 3 ac 3
#4 aa 4 4 aa 1
#5 aa 5 5 aab 1
#5.1 ab 5 5 aab 2
#6 aa 6 6 aac 1
#6.1 ac 6 6 aac 3
#7 aa 7 7 aabc 1
#7.1 ab 7 7 aabc 2
#7.2 ac 7 7 aabc 3
#7.3 bc 7 7 aabc 4
Explanation: strsplit splits the strings from df$name into character vectors. combn creates all 2-char combinations based on those character vectors. paste0 and unique keep the concatenated unique 2-char combinations.
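To make that concrete, here are the intermediate results for one of the longer strings, using the same functions as above:
strsplit("aabc", "")[[1]]
# [1] "a" "a" "b" "c"
apply(combn(c("a", "a", "b", "c"), 2), 2, paste0, collapse = "")
# [1] "aa" "ab" "ac" "ab" "ac" "bc"
unique(apply(combn(c("a", "a", "b", "c"), 2), 2, paste0, collapse = ""))
# [1] "aa" "ab" "ac" "bc"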
Note that this almost reproduces your example. That's because in my case, aabc also gives rise to group 4 = bc.
Update 1
You can filter entries based on a list of 2-char combinations
# Filter entries
filter <- c("aa", "ab", "ac");
df2 <- df2[df2$value %in% filter, ]
# Clean up df2 to be consistent with OPs request
df2 <- df2[, -(1:2)];
df2;
# id name group
#1 1 aa 1
#2 2 ab 2
#3 3 ac 3
#4 4 aa 1
#5 5 aab 1
#5.1 5 aab 2
#6 6 aac 1
#6.1 6 aac 3
#7 7 aabc 1
#7.1 7 aabc 2
#7.2 7 aabc 3
Update 2
You can also create a filter dynamically, by selecting those value entries that are represented as 2-char strings in the original dataframe (in this case aa, ab and ac).
filter <- unique(unlist(group[sapply(group, function(x) length(x) == 1)]));
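For this example the dynamic filter ends up holding exactly the two-character names that occur verbatim in df$name, so it can be plugged into the same filtering step as in Update 1 (a quick check, assuming group and df2 from above):
filter
# [1] "aa" "ab" "ac"
df2 <- df2[df2$value %in% filter, ]   # same filtering step as in Update 1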

How to run efficient group_by statement using dplyr in R

I have a dataset with multiple duplicate IDs which have different categorical values. Following is an example data set.
suppressMessages(library(dplyr))
DUMMY_DATA <- data.frame(ID = c(11,22,22,33,33,33,44,44,55,55,55,55),
CATEGORY1 = c("E","B","C","C","C","D","A","A","B","C","E","B"),
CATEGORY2 = c ("AA","AA","BB","CC","DD","BB","AA","EE","AA","CC","BB","EE"),
stringsAsFactors = FALSE)
> DUMMY_DATA
ID CATEGORY1 CATEGORY2
1 11 E AA
2 22 B AA
3 22 C BB
4 33 C CC
5 33 C DD
6 33 D BB
7 44 A AA
8 44 A EE
9 55 B AA
10 55 C CC
11 55 E BB
12 55 B EE
I want to aggregate values of ID using another dataset which gives the rank of the categorical values, as follows.
Category_Rank1 <- data.frame(VAR = c("A","B","C","D","E"),
RANK = c(1,2,3,4,5),stringsAsFactors = FALSE
)
> Category_Rank1
VAR RANK
1 A 1
2 B 2
3 C 3
4 D 4
5 E 5
Category_Rank2 <- data.frame(VAR = c("AA","BB","CC","DD","EE"),
RANK = c(1,2,3,4,5),stringsAsFactors = FALSE
)
> Category_Rank2
VAR RANK
1 AA 1
2 BB 2
3 CC 3
4 DD 4
5 EE 5
For each group of IDs from DUMMY_DATA I want to look up the Category_Rank and then allot that category to the ID which has the best rank. Following is my solution.
hierarchyTransform <- function(x, dataset){
  x <- unique(x)
  dataset <- dataset %>%
    filter(dataset[, 1] %in% x)
  dataset <- dataset %>%
    filter(dataset[, 2] == min(dataset[, 2]))
  return(dataset[1, 1])
}
NEW_DATA <- DUMMY_DATA %>%
  group_by(ID) %>%
  summarise(CATEGORY1_CLEAN = hierarchyTransform(x = CATEGORY1, dataset = Category_Rank1),
            CATEGORY2_CLEAN = hierarchyTransform(x = CATEGORY2, dataset = Category_Rank2))
I get the Following Result.
> NEW_DATA
# A tibble: 5 × 3
ID CATEGORY1_CLEAN CATEGORY2_CLEAN
<dbl> <chr> <chr>
1 11 E AA
2 22 B AA
3 33 C BB
4 44 A AA
5 55 B AA
This is exactly what I want, but the problem is the time taken for this operation. My original data set has around 1 million rows, and when I group it by ID I get about 200,000 groups. So the hierarchyTransform function is applied to 200,000 groups, which takes about 15 minutes for a single variable, and I have to perform this operation for 10 other variables, which increases the time further. Is there any solution to reduce the time taken for this operation?
If you know the rank order of the levels of CATEGORY (which is alphabetic in your example) then you can turn CATEGORY into a factor with the levels ordered according to the desired ranking. Then sort by CATEGORY, group by ID, and take the first row for each ID.
DUMMY_DATA$CATEGORY = factor(DUMMY_DATA$CATEGORY, levels=LETTERS[1:5], ordered=TRUE)
DUMMY_DATA %>%
  arrange(ID, CATEGORY) %>%
  group_by(ID) %>%
  slice(1)
ID CATEGORY
1 11 E
2 22 B
3 33 C
4 44 A
5 55 B
UPDATE: To respond to your comment and updated question: The code below will, for each ID, select the value of highest rank from each category column.
DUMMY_DATA$CATEGORY1 = factor(DUMMY_DATA$CATEGORY1, levels=LETTERS[1:5], ordered=TRUE)
DUMMY_DATA$CATEGORY2 = factor(DUMMY_DATA$CATEGORY2, levels=c("AA","BB","CC","DD","EE"), ordered=TRUE)
Now you can do either of the following:
DUMMY_DATA %>% group_by(ID) %>%
  summarise(CATEGORY1 = min(CATEGORY1),
            CATEGORY2 = min(CATEGORY2))
DUMMY_DATA %>% group_by(ID) %>%
  summarise_all(funs(min))
ID CATEGORY1 CATEGORY2
1 11 E AA
2 22 B AA
3 33 C BB
4 44 A AA
5 55 B AA
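If the ranking is not simply alphabetical but only encoded in the lookup tables, the ordered factor levels can be derived from those tables instead. A sketch assuming the Category_Rank1 and Category_Rank2 data frames defined in the question:
# order the VAR values by RANK and use them as ordered factor levels
lev1 <- Category_Rank1$VAR[order(Category_Rank1$RANK)]
lev2 <- Category_Rank2$VAR[order(Category_Rank2$RANK)]
DUMMY_DATA$CATEGORY1 <- factor(DUMMY_DATA$CATEGORY1, levels = lev1, ordered = TRUE)
DUMMY_DATA$CATEGORY2 <- factor(DUMMY_DATA$CATEGORY2, levels = lev2, ordered = TRUE)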

Sum columns for the new data frame with conditions in R

I've already asked a similar question, but I stated it incorrectly, so the answers don't help me. Again, I have two data sets. The first one is like this:
df1 <- data.frame(id=c(111,111,111,222,222,333,333,333,333),
type=c("a","b","a","d","b","c","c","b","b"),
var=c(1,0,1,0,1,1,1,1,1))
df1
id type var
1 111 a 1
2 111 b 0
3 111 a 1
4 222 d 0
5 222 b 1
6 333 c 1
7 333 c 1
8 333 b 1
9 333 b 1
The second is like this:
df2
id A B
1 111
2 222
3 333
I need to fill the empty cells in such a way that A is the sum of var with type a or b, and B is the sum of var with type c or d, for each id. The result should be like this:
df2
id A B
1 111 2 0
2 222 1 0
3 333 2 2
It's important to fill this very data frame (df2), not create a new one.
It's really just aggregation plus reshaping to wide form:
library(tidyverse)
# set grouping, edit var to A/B form
df1 %>%
  group_by(id, type = ifelse(type %in% c('a', 'b'), 'A', 'B')) %>%
  summarise(var = sum(var)) %>%
  spread(type, var, fill = 0)   # reshape to wide
## Source: local data frame [3 x 3]
## Groups: id [3]
##
## id A B
## * <dbl> <dbl> <dbl>
## 1 111 2 0
## 2 222 1 0
## 3 333 2 2
You could create A and B in summarise if you subset var, but the code is more repetitive.
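For completeness, a sketch of that more repetitive form (my reading of what it would look like, using the same dplyr grouping as above):
df1 %>%
  group_by(id) %>%
  summarise(A = sum(var[type %in% c('a', 'b')]),
            B = sum(var[type %in% c('c', 'd')]))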
In base R,
df2 <- df1
df2$type <- ifelse(df2$type %in% c('a', 'b'), 'A', 'B')
df2 <- aggregate(var ~ id + type, df2, sum)
df2 <- reshape(df2, timevar = 'type', direction = 'wide')
df2[is.na(df2)] <- 0L
names(df2) <- sub('var\\.', '', names(df2))
df2
## id A B
## 1 111 2 0
## 2 222 1 0
## 3 333 2 2
We can do this in a single line in base R (without using any external packages)
transform(as.data.frame.matrix(xtabs(var~id+type, df1)), A= a+b, B = c+d)[-(1:4)]
# A B
#111 2 0
#222 1 0
#333 2 2
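If df2 really must be filled in place rather than recreated, a minimal base R sketch (my addition, assuming df1 and df2 as defined in the question):
df2$A <- sapply(df2$id, function(i) sum(df1$var[df1$id == i & df1$type %in% c("a", "b")]))
df2$B <- sapply(df2$id, function(i) sum(df1$var[df1$id == i & df1$type %in% c("c", "d")]))
df2
#    id A B
# 1 111 2 0
# 2 222 1 0
# 3 333 2 2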
