reshape data and coerce missing to zero - r

EDIT:
The original dataset can be found here: link
I have a matrix like:
data <- matrix(c("a","1","10",
"b","1","20",
"c","1","30",
"a","2","10",
"b","2","20",
"a","3","10",
"c","3","20"),
ncol=3, byrow=TRUE)
I would like to reshape as a dataframe coercing the missing values to zero:
data <- matrix(c("a","1","10",
"b","1","20",
"c","1","30",
"a","2","10",
"b","2","20",
"c","2","0",
"a","3","10",
"b","3","0",
"c","3","20"),
ncol=3, byrow=TRUE)
How can I do it with the reshape package?
Thanks

We can use complete from tidyr, after converting your data a little:
library(tidyr)
data <- as.data.frame(data)
data$V3 <- as.numeric(as.character(data$V3))
complete(data, V1, V2, fill = list(V3 = 0))

tidyr is better, but if you want to use reshape you can:
library(reshape2)
data2=dcast(data = as.data.frame(data),V1~V2)
data3=melt( data2,measure.vars=colnames(data2)[-1])
data3[is.na(data3)]="0"

Seems to me like you are handling something like a multivariate time series. Therefore I would suggest using a proper time series object.
library(zoo)
res=read.zoo(data.frame(data,stringsAsFactors=FALSE),
split=1,
index.column=2,
FUN=as.numeric)
coredata(res)=as.numeric(coredata(res))
coredata(res)[is.na(res)]=0
This gives
res
# a b c
#1 10 20 30
#2 10 20 0
#3 10 0 20

I think you are doing it wrong by having a matrix with multiple classes.
First I would convert to a data.frame or to a data.table and then convert all the column to the proper type. Something like
library(data.table) # V 1.9.6+
# Convert to data.table
DT <- as.data.table(data)
# Convert to correct column types
for(j in names(DT)) set(DT, j = j, value = type.convert(DT[[j]]))
Then we can expand rows using data.table::CJ and assign zeroes to NA values
## Cross join all column except the third
DT <- DT[do.call(CJ, c(unique = TRUE, DT[, -3, with = FALSE])), on = names(DT)[-3]]
## Or if you want only to operate on these two columns you can alternatively do
# DT <- DT[CJ(V1, V2, unique = TRUE), on = c("V1", "V2")]
## Fill with zeroes
DT[is.na(V3), V3 := 0]
DT
# V1 V2 V3
# 1: a 1 10
# 2: a 2 10
# 3: a 3 10
# 4: b 1 20
# 5: b 2 20
# 6: b 3 0
# 7: c 1 30
# 8: c 2 0
# 9: c 3 20

Related

Replace values in a data.table based on row values in another table

I have two data.tables:
left_table <- data.table(a = c(1,2,3,4), b = c(4,5,6,7), c = c(8,9,10,11))
right_table <- data.table(record = sample(LETTERS, 9))
I would like to replace the numeric entries in left_table by the values associated with the corresponding row numbers in right_table. e.g. All instances of 4 in left_table are replaced by whatever letter (or set of characters in my real data) is on row 4 of right_table and so on.
I have this solution but I feel it's a bit cumbersome and a simpler solution must be possible?
right_table <- data.table(row_n = as.character(seq_along(1:9)), right_table)
for (i in seq_along(left_table)){
cols <- colnames(left_table)
current_col <- cols[i]
# convert numbers to character to allow := to work for matching records
left_table[,(current_col) := lapply(.SD, as.character), .SDcols = current_col]
#right_table[,(current_col) := lapply(.SD, as.character), .SDcols = current_col]
#set key for quick joins
setkeyv(left_table, current_col)
setkeyv(right_table, "row_n")
# replace matching records
left_table[right_table, (current_col) := record]
}
You can create the new columns fetching the letters from right_table using the original variables.
left_table[, c("newa","newb","newc") :=
.(right_table[a,record],right_table[b,record],right_table[c,record])]
# a b c newa newb newc
# 1: 1 4 8 Y A R
# 2: 2 5 9 D B W
# 3: 3 6 10 G K <NA>
# 4: 4 7 11 A N <NA>
Edit:
To make it more generic:
columnNames <- names(left_table)
left_table[, (columnNames) :=
lapply(columnNames, function(x) right_table[left_table[,get(x)],record])]
Although there is probably a better way to do this without needing to call left_table inside lapply()
Using mapvalues from plyr:
library(plyr)
corresp <- function(x) mapvalues(x,seq(right_table$record),right_table$record)
left_table[,c(names(left_table)) := lapply(.SD,corresp),.SDcols = names(left_table)]
a b c
1: N K X
2: U Q V
3: Z I 10
4: K G 11
Here is my attempt. When we replace the numeric values with character values, we get NAs, as we see in some of the other answers. So I decided to take another approach. First, I created a vector using unlist(). Then, I used fifelse() from the data.table package. I used foo as indices and replaced the numbers in foo with characters. I also converted the remaining numeric values to character (i.e., 10 and 11 in the sample data). Then, I created a matrix and converted it to a data.table object. Finally, I assigned column names to the object.
library(data.table)
foo <- unlist(left_table)
temp <- fifelse(test = foo <= nrow(right_table),
yes = right_table$record[foo],
no = as.character(foo))
res <- as.data.table(matrix(data = temp, nrow = nrow(left_table)))
setnames(res, names(left_table))
# a b c
#1: B G J
#2: Y D I
#3: P T 10
#4: G S 11
I think it might be easier to just keep record as a vector and access it via indexing:
left_table <- data.table(a = c(1,2,3,4), b = c(4,5,6,7), c = c(8,9,10,11))
# a b c
#1: 1 4 8
#2: 2 5 9
#3: 3 6 10
#4: 4 7 11
set.seed(0L)
right_table <- data.table(record = sample(LETTERS, 9))
record <- right_table$record
#[1] "N" "Y" "D" "G" "A" "B" "K" "Z" "R"
left_table[, names(left_table) := lapply(.SD, function(k) fcoalesce(record[k], as.character(k)))]
left_table
# a b c
# 1: N G Z
# 2: Y A R
# 3: D B 10
# 4: G K 11

R, how to replace only the numeric values of a dataframe?

I am working on R 3.4.3 on Windows 10. I have a dataframe made of numeric values and characters.
I would like to replace only the numeric values but when I do that the characters also change and are replaced.
How can I edit my function to make it affect only the numeric values and not the characters?
Here is the piece of code of my function:
dataframeChange <- function(dFrame){
thresholdVal <- 20
dFrame[dFrame >= thresholdVal] <- -1
return(dFrame)
}
Here is a dataframe example:
example_df <- data.frame(
myNums = c (1:5),
myChars = c("A","B","C","D","E"),
stringsAsFactors = FALSE
)
Thanks for the help!
As Tim's comment points out, you need to know where the numeric columns are; we can locate them using ind <- sapply(dFrame, is.numeric)
# Replace every value greater than or equal to 20 in the numeric columns of a
# data frame with -1, leaving non-numeric columns untouched.
#
# dFrame: a data.frame, possibly mixing numeric and non-numeric columns.
# Returns the modified copy of dFrame.
dataframeChange <- function(dFrame) {
  thresholdVal <- 20
  # vapply() always yields a logical vector, even for a zero-column input.
  ind <- vapply(dFrame, is.numeric, logical(1))
  # Work one column at a time: comparing several columns at once produces a
  # logical matrix, which cannot be used as a row index.
  for (j in which(ind)) {
    values <- dFrame[[j]]
    # which() drops NA comparisons so missing values are left as they are.
    values[which(values >= thresholdVal)] <- -1
    dFrame[[j]] <- values
  }
  dFrame
}
Use mutate_if from dplyr:
library(dplyr)
example_df %>% mutate_if(is.numeric, funs(if_else(. >= thresh, repl, .)))
myNums myChars
1 10 A
2 -1 B
3 -1 C
4 5 D
5 -1 E
Explanation:
The mutate family of functions is for variable assignment or updating.
mutate_if functions (specified within funs()) are only applied to columns which satisfy the first argument (in this case, is.numeric())
The updating function is a simple if_else clause based on OP rules.
Data:
thresh <- 20
repl <- -1.0
example_df <- data.frame(
myNums = c(10,20,30,5,70),
myChars = c("A","B","C","D","E"),
stringsAsFactors = FALSE
)
example_df
myNums myChars
1 10 A
2 20 B
3 30 C
4 5 D
5 70 E
Using data.table, we can avoid explicit loops, and it is faster. Here I've set the threshold value to 2:
# set to data table
setDT(example_df)
# get numeric columns
num_cols <- names(example_df)[sapply(example_df, is.numeric)]
# loop over all columns at once
example_df[,(num_cols) := lapply(.SD, function(x) ifelse(x>2,-1, x)), .SDcols=num_cols]
print(example_df)
myNums myChars
1: 1 A
2: 2 B
3: -1 C
4: -1 D
5: -1 E
Another data.table solution.
library(data.table)
# Replace every value below 20 with -1, column by column, updating the table
# in place with data.table::set().
#
# dFrame: a data.frame or data.table. setDT() converts it to a data.table by
#   reference, so the object passed by the caller is the one that is modified.
# Returns dFrame invisibly (the useful effect is the in-place update).
dataframeChange <- function(dFrame) {
  setDT(dFrame)
  for (j in seq_along(dFrame)) {
    # NOTE(review): every column is compared against 20, not only numeric
    # ones; a character column holding strings that sort below "20" would be
    # matched too -- confirm this is acceptable for your data.
    set(dFrame, i = which(dFrame[[j]] < 20), j = j, value = -1)
  }
  invisible(dFrame)
}
# Call the function under the name it was defined with.
dataframeChange(example_df)
example_df
# myNums myChars
# 1: -1 A
# 2: 20 B
# 3: 30 C
# 4: -1 D
# 5: 70 E
It does not explicitly select only numeric columns; however, I tested it on multiple datasets and it does not affect the non-numeric columns.

How to append values of columns? [duplicate]

This question already has answers here:
paste two data.table columns
(4 answers)
Closed 6 years ago.
For example there is the following data.table:
dt <- data.table(x = list(1:2, 3:5, 6:9), y = c(1,2,3))
# x y
# 1: 1,2 1
# 2: 3,4,5 2
# 3: 6,7,8,9 3
I need to create a new data.table, where values of the y column will be appended to lists stored in the x column:
# z
# 1: 1,2,1
# 2: 3,4,5,2
# 3: 6,7,8,9,3
I've tried lapply, cbind, list, c functions. But I can't get the table I need.
UPDATE:
The question is different from paste two data.table columns because a trivial solution with paste function or something like this doesn't work.
This will do it
# Merge two lists
dt[, z := mapply(c, x, y, SIMPLIFY=FALSE)]
print(dt)
x y z
1: 1,2 1 1,2,1
2: 3,4,5 2 3,4,5,2
3: 6,7,8,9 3 6,7,8,9,3
And deleting the original x and y columns
dt[, c("x", "y") := NULL]
print(dt)
z
1: 1,2,1
2: 3,4,5,2
3: 6,7,8,9,3
I would like to suggest a general approach for this kind of task in case you have multiple columns that you would like to combine into a single column
An example data with multiple columns
dt <- data.table(x = list(1:2, 3:5, 6:9), y = 1:3, z = list(4:6, NULL, 5:8))
Solution
res <- melt(dt, measure.vars = names(dt))[, .(.(unlist(value))), by = rowid(variable)]
res$V1
# [[1]]
# [1] 1 2 1 4 5 6
#
# [[2]]
# [1] 3 4 5 2
#
# [[3]]
# [1] 6 7 8 9 3 5 6 7 8
The idea here is to convert to long format and then unlist/list by group
(You will receive a warning due to different classes in the resulting value column)

Generate All ID Pairs, by group with data.table in R

I have a data.table with many individuals (with ids) in many groups. Within each group, I would like to find every combination of ids (every pair of individuals). I know how to do this with a split-apply-combine approach, but I am hoping that a data.table would be faster.
Sample data:
dat <- data.table(ids=1:20, groups=sample(x=c("A","B","C"), 20, replace=TRUE))
Split-Apply-Combine Method:
datS <- split(dat, f=dat$groups)
datSc <- lapply(datS, function(x){ as.data.table(t(combn(x$ids, 2)))})
rbindlist(datSc)
head(rbindlist(datSc))
V1 V2
1: 2 5
2: 2 10
3: 2 19
4: 5 10
5: 5 19
6: 10 19
My best data.table attempt produces a single column, not two columns with all the possible combinations:
dat[, combn(x=ids, m=2), by=groups]
Thanks in advance.
You need to convert the result from t(combn()) which is a matrix to a data.table or data.frame, so this should work:
library(data.table)
set.seed(10)
dat <- data.table(ids=1:20, groups=sample(x=c("A","B","C"), 20, replace=TRUE))
# Build every pair of ids within each group. Groups with fewer than two ids
# are skipped: combn() treats a single positive integer n as seq_len(n), which
# would otherwise produce pairs of ids that do not belong to the group.
dt <- dat[, if (.N >= 2L) as.data.table(t(combn(ids, 2))), by = .(groups)]
head(dt)
groups V1 V2
1: C 1 3
2: C 1 5
3: C 1 7
4: C 1 10
5: C 1 13
6: C 1 14
library(data.table)
dat <- data.table(ids=1:20, groups=sample(x=c("A","B","C"), 20, replace=TRUE))
# One 2-row matrix of id pairs per group, in order of first appearance.
ind <- unique(dat$groups)
lapply(ind, function(g) {
  ids <- dat$ids[dat$groups == g]
  # combn() interprets a single positive integer n as seq_len(n), so a group
  # with fewer than two ids gets an empty 2 x 0 matrix instead of bogus pairs.
  if (length(ids) < 2L) {
    return(matrix(ids[0L], nrow = 2L, ncol = 0L))
  }
  combn(ids, 2)
})
You can then change the list to any other type of format you might need.

Random subsets in function of one column in r

I want to extract n rows randomly from a data frame based on the values of one column.
So with this example :
# Reproducible example
df <- as.data.frame(matrix(0,2e+6,2))
df$V1 <- runif(nrow(df),0,1)
df$V2 <- sample(c(1:10),nrow(df), replace=TRUE)
df$V3 <- sample(c("A","B","C"),nrow(df), replace=TRUE)
I want to extract, for example, n = 10 rows for each value of V2.
# Example of what I need with one value of V2
df1 <- df[which(df$V2==1),]
str(df1)
df1[sample(1:nrow(df1),10),]
I do not want to use any for-loop, so I tried this line with tapply:
df_objective <- tapply(df$V1, df$V2, function(x) df[sample(1:nrow(df),10),"V2"])
which is close to what I want but I lost the third column of the data frame.
I tried this to have complete subsets :
df_objective <- by(cbind(df$V1,df$V3), df$V2, function(x) df[sample(1:nrow(df),10),"V2"])
but it does not help.
How can I keep all the columns in the subsets ?
It sounds like you're just looking for something like sample_n from "dplyr":
library(dplyr)
df %>% group_by(V2) %>% sample_n(10)
# Source: local data frame [100 x 3]
# Groups: V2
#
# V1 V2 V3
# 1 0.51099392 1 B
# 2 0.87098866 1 A
# 3 0.13647752 1 B
# 4 0.15348834 1 B
# 5 0.94096127 1 B
# 6 0.05673849 1 A
# 7 0.69960842 1 C
# 8 0.02246671 1 C
# 9 0.88903430 1 B
# 10 0.52128253 1 A
# .. ... .. ..
Alternatively, there's stratified from my "splitstackshape" package.
library(splitstackshape)
stratified(df, "V2", 10)
You can try
library(data.table)
setDT(df)[, .SD[sample(.N, 10)] , V2]
Or a faster option as suggested by @Frank
setDT(df)[df[,sample(.I,10),V2]$V1]
You want to sample from the rows, so that should be the first arg to tapply, not V1:
# Draw 10 row indices of df within each level of V2. Indexing idx through
# sample.int() avoids sample()'s special case in which a length-one numeric x
# is sampled as 1:x rather than as the single value x.
myrows <- unlist(tapply(
  seq_len(nrow(df)), df$V2,
  function(idx) idx[sample.int(length(idx), size = 10)]
))
# myrows holds row positions of the full data frame, so subset df itself
# (not df1, which only contains the rows where V2 == 1).
df[myrows, ]

Resources