Insert columns by column index - r

Given the following data frame:
> header = c("A1","A2","A3","B1","B2","B3")
> df = matrix(c(0,0,0,0,0,0),nrow = 1)
> colnames(df) = header
> df
A1 A2 A3 B1 B2 B3
[1,] 0 0 0 0 0 0
I know the column index numbers of the headers containing "2" by:
> index2 = grep("2", colnames(df))
> index2
[1] 2 5
I want to add two extra columns next to each of the columns with indices 2 and 5, named "A2.1", "A2.2" and "B2.1", "B2.2", so that:
     A1 A2 A2.1 A2.2 A3 B1 B2 B2.1 B2.2 B3
[1,]  0  0    0    0  0  0  0    0    0  0
How can I do this?
Many thanks in advance!

Assuming that you want to insert columns based on 'index2', one option is
# duplicate the index2 columns twice and bind the copies on the right
df1 <- cbind(df, do.call(cbind,
             replicate(2, df[, index2, drop = FALSE], simplify = FALSE)))
# reorder by column name, then make the duplicated names unique (A2, A2.1, A2.2, ...)
df2 <- df1[, order(colnames(df1)), drop = FALSE]
colnames(df2) <- make.unique(colnames(df2))
df2
# A1 A2 A2.1 A2.2 A3 B1 B2 B2.1 B2.2 B3
#[1,] 0 0 0 0 0 0 0 0 0 0
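If you prefer to insert at explicit positions instead of relying on alphabetical name ordering, here is a minimal sketch (it assumes, as above, that the new columns start out as zeros and that index2 holds the target column indices):
# build the four new zero-filled columns, named after the columns they follow
new_names <- paste0(rep(colnames(df)[index2], each = 2), ".", 1:2)
extra <- matrix(0, nrow = nrow(df), ncol = length(new_names),
                dimnames = list(NULL, new_names))
# desired final order: each index2 column is immediately followed by its two copies
pos <- unlist(lapply(colnames(df), function(nm) {
  if (nm %in% colnames(df)[index2]) c(nm, paste0(nm, ".", 1:2)) else nm
}))
cbind(df, extra)[, pos, drop = FALSE]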

You could try something like this:
set.seed(1234)
df <- data.frame(matrix(runif(100),ncol=5))
colnames(df) <- LETTERS[1:ncol(df)]
B.1 <- runif(20)
df <- cbind(df,B.1)
df <- df[,order(colnames(df))]
#> head(df)
# A B B.1 C D E
#1 0.1137034 0.31661245 0.03545673 0.5533336 0.86483383 0.9264005
#2 0.6222994 0.30269337 0.56507611 0.6464061 0.04185728 0.4719097
#3 0.6092747 0.15904600 0.28025778 0.3118243 0.31718216 0.1426153
#4 0.6233794 0.03999592 0.20419632 0.6218192 0.01374994 0.5442698
#5 0.8609154 0.21879954 0.13373890 0.3297702 0.23902573 0.1961747
#6 0.6403106 0.81059855 0.32568192 0.5019975 0.70649462 0.8985805
This means that you first attach the new column on the right with cbind() and then reorder the column sequence afterwards. Hope this helps.
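Applied to the one-row matrix from the question, the same cbind-then-order idea could look like this (a sketch; it assumes the new columns should start as zeros and relies on the new names sorting into place):
# rebuild the question's one-row matrix, then add zero columns and reorder by name
df0 <- matrix(0, nrow = 1, ncol = 6,
              dimnames = list(NULL, c("A1","A2","A3","B1","B2","B3")))
extra <- matrix(0, nrow = 1, ncol = 4,
                dimnames = list(NULL, c("A2.1","A2.2","B2.1","B2.2")))
out <- cbind(df0, extra)
out[, order(colnames(out)), drop = FALSE]
#      A1 A2 A2.1 A2.2 A3 B1 B2 B2.1 B2.2 B3
# [1,]  0  0    0    0  0  0  0    0    0  0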

Related

How to create a CSV from a table that came from a merge routine (R)? Somehow the table is not saved and I can't convert it into a data frame

library(reshape2)
Customer    <- c("Susan", "Louis", "Frank", "Susan")
Seller      <- c("Ivan", "Donald", "Chris", "Ivan")
Service     <- c("COU", "CAR", "FCL", "CAR")
Billingmean <- c(100, 200, 300, 400)
WrsHoldSum  <- c(0, 0, 0, 0)
Group       <- c("n1", "n2", " ", " ")
B1          <- c(0, 2, 2, 1)
B2          <- c(9, 8, 7, 6)
B3          <- c(5, 4, 3, 2)
This data frame includes information such as the billing mean of last year's sales, the seller, and the type of service:
df <- data.frame(Customer, Seller, Service, Billingmean, WrsHoldSum, Group, B1, B2, B3)
This section uses dcast() to reshape the data frame so that I can use the file later in Word to fill some directories in "Mail Merge" mode:
sub1<- dcast(data= df, formula= Customer+Group+Seller+WrsHoldSum~Service,fun.aggregate= sum,value.var= "Billingmean")
sub2<- dcast(data= df, formula= Customer+Group+Seller+WrsHoldSum~Service,fun.aggregate= sum,value.var= "B1")
sub3<- dcast(data= df, formula= Customer+Group+Seller+WrsHoldSum~Service,fun.aggregate= sum,value.var= "B2")
sub4<- dcast(data= df, formula= Customer+Group+Seller+WrsHoldSum~Service,fun.aggregate= sum,value.var= "B3")
Here I add the new columns to build the combined table; however, the result is not assigned to an object in the environment, so I can't call write.csv() on it.
library(data.table)  # for copy() and setnames()
library(magrittr)    # for %>%
tNames <- grep(x = ls(), pattern = "^sub", value = TRUE)
lapply(seq_along(tNames), function(x) {
  tSym <- as.name(tNames[[x]])
  d1 <- copy(eval(tSym))
  cols <- grep(x = names(d1), pattern = "^CAR|^COU|^FCL", value = TRUE)
  setnames(d1, old = cols, new = paste0(cols, " B", x))
  return(d1)
}) %>% Reduce(function(x, y) merge(x, y, by = c("Customer", "Group", "Seller", "WrsHoldSum")), .)
I also don't know if there's another way to merge the new columns for Billing 1 (B1), Billing 2 (B2), Billing 3 (B3), and so on.
This is the expected output
  Customer Group Seller WrsHoldSum CAR B1 COU B1 FCL B1 CAR B2 COU B2 FCL B2 CAR B3 COU B3 FCL B3 CAR B4 COU B4 FCL B4
1    Frank        Chris          0      0      0    300      0      0      2      0      0      7      0      0      3
2    Louis    n2 Donald          0    200      0      0      2      0      0      8      0      0      4      0      0
3    Susan         Ivan          0    400      0      0      1      0      0      6      0      0      2      0      0
4    Susan    n1   Ivan          0      0    100      0      0      0      0      0      9      0      0      5      0
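A minimal sketch of one way around the write.csv() problem: capture the pipeline's result in a named object first, then write that object out (the object and file names below are illustrative assumptions, not from the original post).
# Sketch only: same pipeline as above, but assigned to 'merged_tbl' so it
# exists in the environment and can be passed to write.csv().
merged_tbl <- lapply(seq_along(tNames), function(x) {
  tSym <- as.name(tNames[[x]])
  d1 <- copy(eval(tSym))
  cols <- grep(x = names(d1), pattern = "^CAR|^COU|^FCL", value = TRUE)
  setnames(d1, old = cols, new = paste0(cols, " B", x))
  d1
}) %>%
  Reduce(function(x, y) merge(x, y, by = c("Customer", "Group", "Seller", "WrsHoldSum")), .)
write.csv(merged_tbl, "mail_merge_directory.csv", row.names = FALSE)  # file name is hypothetical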

Dependents and Precedents in R

I need help flagging the number of dependents and precedents in R. My data frame contains some formulas (strings), and I want to add "col3", which should contain 0 for A1, 1 for A2 (because A2 depends on A1: one dependency) and 2 for A3 (because A3 depends on A2/A1).
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A4+Y3+A2', 'Y5+A1','A2+A1+A3','A2+A1')
df <- data.frame(col1, col2, stringsAsFactors=F)
My Output should look like:
col1 col2 col3
1 A1 Y1 0
2 A2 A1+Y2 1
3 A3 A4+Y3+A2 5
4 A6 Y5+A1 1
5 A4 A2+A1+Y3 3
6 A7 A2+A1 3
I have a data frame with 100+ rows in this format. I'd appreciate it if you could help with this.
The code below produces the correct output.
col0 <- c('A1','A2','A3','A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2','A1+Y3+A2','Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col0, col2, stringsAsFactors = FALSE)
library(tidyr)
library(dplyr)
# split each formula into one term per column (up to four terms)
df1 <- df %>%
  separate(col2, into = as.character(1:4), sep = "\\+", fill = "right") %>%
  replace(is.na(.), "")
# OOE accumulates the direct and indirect dependency count for each row
df1$OOE <- 0
for (i in 1:nrow(df1)) {      # row whose dependencies we are counting
  for (j in 2:5) {            # the four term columns
    for (k in 1:nrow(df1)) {  # row that a term may refer to
      if (df1[i, j] == df1$col0[k]) df1$OOE[i] <- df1$OOE[k] + df1$OOE[i] + 1
    }
  }
}
df1
  col0  1  2  3  4 OOE
1   A1 X1 Y1         0
2   A2 A1 Y2         1
3   A3 A1 Y3 A2      3
4   A6 Y5 A2         2
5   A4 A2 A1 A3      7
6   A7 A2 A3         6
If AX can have a dependency on AY where Y > X, we need a tree-like structure to find the dependencies. I know about the igraph package, but it seems too complex for this task. We just need some reference semantics, and after some research the data.tree package seems appropriate. Here is the code:
col1 <- c('A1','A2','A3','A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2','A1+Y3+A2','Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col1, col2, stringsAsFactors = FALSE)
require(data.tree)
# Create the graph/forest based on the data
getForest <- function(data) {
  res <- new.env()
  for (i in 1:nrow(data)) {
    nname <- data$col1[i]
    if (!exists(nname, where = res))
      assign(nname, Node$new(nname), pos = res)
    par <- get(nname, envir = res)
    # Add the children: every "A<digits>" term in col2 is a dependency
    deps <- unlist(regmatches(data$col2[i], gregexpr("A\\d+", data$col2[i])))
    for (ch in deps) {
      if (!exists(ch, where = res))
        assign(ch, Node$new(ch), pos = res)
      child <- get(ch, envir = res)
      par$AddChildNode(child)
    }
  }
  # Return the environment holding the nodes
  res
}
f <- getForest(df)
# Function to get the dependency level: direct children plus their levels
getLevel <- function(node) {
  if (node$count == 0) {
    return(0)
  } else {
    return(length(node$children) + sum(sapply(node$children, getLevel)))
  }
}
#Add dependency level to data frame
df$col3 <- sapply(df$col1, function(x) {getLevel(get(x,f))})
df
# col1 col2 col3
#1 A1 X1+Y1 0
#2 A2 A1+Y2 1
#3 A3 A1+Y3+A2 3
#4 A6 Y5+A2 2
#5 A4 A2+A1+A3 7
#6 A7 A2+A3 6
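As a quick usage check (a sketch using the forest built above), you can also query a single node directly; the value matches the A4 row in the output:
getLevel(get("A4", f))
#[1] 7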

Column Split into columns and rows in R

My data looks like this:
df <- data.frame(user_id = c('13','15'),
                 answer_id = c('{"row[0][0]":"A","row[0][1]":"B","row[0][2]":"C","row[0][3]":"D","row[1][0]":"A1","row[1][1]":"B1","row[1][2]":"C1","row[1][3]":"D1"}',
                               '{"row[0][0]":"W","row[0][1]":"X","row[0][2]":"Y","row[0][3]":"Z","row[1][0]":"W1","row[1][1]":"X1","row[1][2]":"Y1","row[1][3]":"Z1"}'))
Desired data view
user_id answer_id1 answer_id2 answer_id3 answer_id4
13 A B C D
13 A1 B1 C1 D1
15 W X Y Z
15 W1 X1 Y1 Z1
I'm new to R and hope to get a solution soon, as I always do.
This may not be the best solution, but it can get you from your sample input to your desired output using stringr, purrr, and tidyr. See regex101 for an explanation of the regex used in the stringr::str_match_all() call.
df <- data.frame(user_id = c('13','15'),
                 answer_id = c('{"row[0][0]":"A","row[0][1]":"B","row[0][2]":"C","row[0][3]":"D","row[1][0]":"A1","row[1][1]":"B1","row[1][2]":"C1","row[1][3]":"D1"}',
                               '{"row[0][0]":"W","row[0][1]":"X","row[0][2]":"Y","row[0][3]":"Z","row[1][0]":"W1","row[1][1]":"X1","row[1][2]":"Y1","row[1][3]":"Z1"}'),
                 stringsAsFactors = FALSE)
# use regex to extract the row indices and answer values
regex_matches <- stringr::str_match_all(df$answer_id, '\\"row\\[(\\d+)\\]\\[(\\d+)\\]\\":\\"([^\\"]*)\\"')
# add the user id to each result
answers_by_user <- purrr::map2(df$user_id, regex_matches, ~cbind(.x, .y[, -1]))
# combine the list of matrices and convert to a data frame
answers_df <- data.frame(do.call(rbind, answers_by_user))
# add meaningful names
names(answers_df) <- c("user_id", "row_1", "row_2", "value")
# convert to wide: one answer column per row_2 value
final_df <- tidyr::spread(answers_df, row_2, value)
# remove the row column
final_df$row_1 <- NULL
# clean up names
names(final_df) <- c("user_id", "answer_id1", "answer_id2", "answer_id3", "answer_id4")
final_df
#output
user_id answer_id1 answer_id2 answer_id3 answer_id4
1 13 A B C D
2 13 A1 B1 C1 D1
3 15 W X Y Z
4 15 W1 X1 Y1 Z1
Column 2 looks like JSON, so you could do something like this to get it into a form you can work with...
library(rjson)
# parse each row's JSON into a long data frame of user/answer pairs
df2 <- lapply(1:nrow(df), function(i)
  data.frame(user = df[i, 1],
             answer = unlist(fromJSON(as.character(df[i, 2]))),
             stringsAsFactors = FALSE))
df2 <- do.call(rbind, df2)
# recover the two row indices from names like "row[0][3]"
df2[, "r1"] <- gsub(".+\\[(\\d)]\\[(\\d)].*", "\\1", rownames(df2))
df2[, "r2"] <- gsub(".+\\[(\\d)]\\[(\\d)].*", "\\2", rownames(df2))
df2
user answer r1 r2
row[0][0] 13 A 0 0
row[0][1] 13 B 0 1
row[0][2] 13 C 0 2
row[0][3] 13 D 0 3
row[1][0] 13 A1 1 0
row[1][1] 13 B1 1 1
row[1][2] 13 C1 1 2
row[1][3] 13 D1 1 3
row[0][0]1 15 W 0 0
row[0][1]1 15 X 0 1
row[0][2]1 15 Y 0 2
row[0][3]1 15 Z 0 3
row[1][0]1 15 W1 1 0
row[1][1]1 15 X1 1 1
row[1][2]1 15 Y1 1 2
row[1][3]1 15 Z1 1 3
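From there, one way (a sketch, not part of the original answer) to get the wide layout the question asks for is to spread r2 into columns, mirroring the tidyr approach above; the final column names are assumptions:
library(tidyr)
# one answer column per r2 value, one row per user/r1 combination
wide <- spread(df2, r2, answer)
names(wide) <- c("user_id", "row", paste0("answer_id", 1:4))
wide$row <- NULL   # drop the r1 helper column
wide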

Convert negative values to zero in dataframe in R

I have a data frame in R in which I would like to convert all negative values to zero in every column except the id columns:
id1 id2 var1 var2 var3
-1 -1 0 -33 5
-1 -2 9 -10 -1
I can convert all columns with a line like:
temp[temp < 0] <- 0
But I can't adjust it to only a subset of columns. I've tried:
temp[temp < 0, -c(1,2)] <- 0
But this errors, saying that non-existent rows are not allowed.
Editing your variant a bit:
temp[,-c(1,2)][temp[, -c(1,2)] < 0] <- 0
You can try using replace:
> mydf[-c(1, 2)] <- replace(mydf[-c(1, 2)], mydf[-c(1, 2)] < 0, 0)
> mydf
id1 id2 var1 var2 var3
1 -1 -1 0 0 5
2 -1 -2 9 0 0
We can use data.table
setDT(d1)
for(j in grep('^var', names(d1))){
set(d1, i= which(d1[[j]]<0), j= j, value=0)
}
d1
# id1 id2 var1 var2 var3
# 1: -1 -1 0 0 5
# 2: -1 -2 9 0 0
There might be fancier or more compact ways, but here's a vectorised replacement you can apply to the var columns:
mytable <- read.table(textConnection("
id1 id2 var1 var2 var3
-1 -1 0 -33 5
-1 -2 9 -10 -1"), header = TRUE)
mytable[, grep("^var", names(mytable))] <-
apply(mytable[, grep("^var", names(mytable))], 2, function(x) ifelse(x < 0, 0, x))
mytable
## id1 id2 var1 var2 var3
## 1 -1 -1 0 0 5
## 2 -1 -2 9 0 0
You could use pmax:
dat <- data.frame(id1 = c(-1,-1), id2 = c(-1,-2), var1 = c(0,9), var2 = c(-33,-10), var3 = c(5,-1))
dat[, -c(1,2)] <- matrix(pmax(unlist(dat[, -c(1,2)]), 0), nrow = nrow(dat))
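For completeness, a dplyr sketch (not one of the original answers) that does the same thing, assuming the non-id columns all start with "var":
library(dplyr)
dat %>% mutate(across(starts_with("var"), ~ pmax(.x, 0)))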

How to suppress printing of 0 lines of a table?

I have two factor vectors, v1 and v2, which appear to be closely related (the entropy of each is very close to their joint entropy). Indeed, when I do table(v1, v2), I see something like this:
v2
v1 a2 b2 c2
a1 0 100 0
b1 0 0 0
c1 0 0 0
v2
v1 d2 e2 f2
a1 0 0 0
b1 0 0 0
c1 0 0 0
and so on - each factor has dozens of levels, so I get plenty of lines with all 0.
How do I print a table, omitting rows which contain only zeros?
Everybody seems to use rowSums(d) == 0 or an equivalent, but that will also suppress any row with equal numbers of ones and minus ones, or any other zero-sum combination. Safer would be to use:
d[ rowSums(d==0) != ncol(d) , ]
I suppose that when the object is the result of table() there is no risk of negative entries, but the risk arises when this strategy is inappropriately applied in other settings.
Using your example:
v1 <- factor(rep("a1", 100), levels = paste0(letters[1:3], 1))
v2 <- factor(rep("b2", 100), levels = paste0(letters[1:6], 2))
R> tab <- table(v1, v2)
R> tab
v2
v1 a2 b2 c2 d2 e2 f2
a1 0 100 0 0 0 0
b1 0 0 0 0 0 0
c1 0 0 0 0 0 0
Then the rowSums() function will compute the row sums for us. This works because a table is either a vector or a matrix in disguise. Note, in the sequence below showing the intermediate steps, how we convert the row sums into a logical vector by asking whether they exceed 0.
R> rowSums(tab)
a1 b1 c1
100 0 0
R> rowSums(tab) > 0
a1 b1 c1
TRUE FALSE FALSE
R> tab[rowSums(tab) > 0, ]
a2 b2 c2 d2 e2 f2
0 100 0 0 0 0
The above drops the empty dimension. If you want to keep the table format, add drop = FALSE to the call; note the extra , in there, as we want all of the columns, hence the empty argument between the two commas:
R> tab[rowSums(tab) > 0, , drop = FALSE]
v2
v1 a2 b2 c2 d2 e2 f2
a1 0 100 0 0 0 0
I'd approach this with rowSums() to get a logical vector of the rows whose sums are greater than 0, and then use that vector for indexing, as in:
#make an example (please do this for yourself in the future)
d <- table(x=1:5, y=1:5)
d[1, 1] <- 0 #make one row have all 0s
d[rowSums(d) > 0, ]
Borrowing the example data from @Gavin's answer:
v1 <- factor(rep("a1", 100), levels = paste0(letters[1:3], 1))
v2 <- factor(rep("b2", 100), levels = paste0(letters[1:6], 2))
You can use droplevels to eliminate those values that do not appear anywhere (equivalent to dropping rows with all 0's, or columns with all 0's):
> table(droplevels(v1), droplevels(v2))
b2
a1 100
If you only want to drop rows:
> table(droplevels(v1), v2)
v2
a2 b2 c2 d2 e2 f2
a1 0 100 0 0 0 0
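If you also want to drop all-zero columns while keeping the table format, the row and column filters can be combined in one subset call (a small sketch using the tab object built earlier):
tab[rowSums(tab) > 0, colSums(tab) > 0, drop = FALSE]
# keeps only the a1 row and the b2 column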
