remove subsequent values in row in R - r

I want to remove a specific repetitive value in each row in the dataframe.
let's say:
DF (3*5)
# c1 c2 c3 c4 c5
1 A A B A NA
2 C C A A B
3 B A A NA NA
I want to remove consecutive repeats of the value A, i.e. keep only the first A of every run of adjacent A's.
so New_df:
# c1 c2 c3 c4 c5
1 A B A NA NA
2 C C A B NA
3 B A NA NA NA
P.S. Column 5 (c5) can be removed because it contains only NA values.
The point is to remove only consecutive repeats of 'A'; consecutive repeats of other values should not be changed (e.g. the second row has two 'C's, which should not be removed).
How can I do it in R?

Let's assume that you entered these as character values rather than as factors. That way we don't need the additional step of running DF[] <- lapply ( . , as.character).
DF <- read.table(text = "
A A B A NA
C C A A B
B A A NA NA", stringsAsFactors=FALSE)
It appears you only want the duplicated A's removed and shifted with the positions at the end replaced with NA's:
# For every row: squeeze each run of "A" down to a single "A", leave all other
# runs untouched, then right-pad with NA so the row keeps its original length.
t(apply(DF, 1, function(row_vals) {
  runs <- rle(row_vals)
  is_a_run <- runs$values == "A"
  runs$lengths[is_a_run] <- 1
  kept <- rep(runs$values, runs$lengths)
  n_pad <- length(row_vals) - sum(runs$lengths)
  c(kept, rep(NA, n_pad))
}))
[,1] [,2] [,3] [,4] [,5]
[1,] "A" "B" "A" NA NA
[2,] "C" "C" "A" "B" NA
[3,] "B" "A" NA NA NA

Loop through rows, use rle and get values:
# example data
DF <- read.table(text = "
A A B A NA
C C A A B
B A A NA NA")

# Collapse every run of `value` in `x` to a single occurrence.
# Runs of any other value (e.g. "C" "C") are kept as they are.
#   x     : atomic vector (one row of the data)
#   value : the value(s) whose consecutive repeats are removed
# Returns a vector of the same length as `x`, padded at the end with NA.
collapse_runs <- function(x, value = "A") {
  runs <- rle(unname(x))
  # %in% never yields NA, so runs of NA are simply left alone
  runs$lengths[runs$values %in% value] <- 1L
  res <- inverse.rle(runs)
  # pad with NA so every row has the same length and apply() returns a matrix
  length(res) <- length(x)
  res
}

data.frame(t(apply(DF, 1, collapse_runs)))
# output
# X1 X2 X3 X4 X5
# 1 A B A <NA> <NA>
# 2 C C A B <NA>
# 3 B A <NA> <NA> <NA>

Is this what you're after?
# Drop every "A" that directly follows another "A" in the same row. Each row is
# padded back to its original length with NA so apply() always returns a
# matrix; columns that end up entirely NA are removed afterwards.
res <- as.data.frame(t(apply(DF, 1, function(x) {
  idx <- which(x == "A")
  # positions of an "A" whose left neighbour is also an "A"; the mask must be
  # logical (a numeric 0/1 mask would be read as positions into idx)
  drop <- idx[c(FALSE, diff(idx) == 1)]
  # guard: x[-integer(0)] would select nothing instead of everything
  out <- unname(if (length(drop) > 0) x[-drop] else x)
  length(out) <- length(x)
  out
})))
res[, colSums(!is.na(res)) > 0, drop = FALSE]
# V1 V2 V3 V4
#1 A B A <NA>
#2 C C A B
#3 B A <NA> <NA>
Sample data
DF <- read.table(text =
"1 A A B A NA
2 C C A A B
3 B A A NA NA", header = F, row.names = 1)

Related

how to add a new row with extra column in R?

I am trying to add the results of a for loop to a data frame as new rows, but I get an error whenever a new result has more columns than the original data frame. How can I add such a result, so that the extra column (and its name) is also added to the original data frame?
e.g.
original dataframe:
-______A B C
x1 1 1 1
x2 2 2 2
x3 3 3 3
I want to get
-______A B C D
x1 1 1 1 NA
x2 2 2 2 NA
x3 3 3 3 NA
X4 4 4 4 4
I tried rbind (Error in rbind(deparse.level, ...) :
numbers of columns of arguments do not match)
and rbind_fill (Error: All inputs to rbind.fill must be data.frames)
and bind_rows (Argument 2 must have names)
In base R, this can be done by creating a new column 'D' with NA and then assign new row with 4.
df1$D <- NA
df1['x4', ] <- 4
-output
> df1
A B C D
x1 1 1 1 NA
x2 2 2 2 NA
x3 3 3 3 NA
x4 4 4 4 4
Or in a single line
rbind(cbind(df1, D = NA), x4 = 4)
A B C D
x1 1 1 1 NA
x2 2 2 2 NA
x3 3 3 3 NA
x4 4 4 4 4
Regarding the error in bind_rows, it happens when the for loop output is not a named vector
library(dplyr)
> vec1 <- c(4, 4, 4, 4)
> bind_rows(df1, vec1)
Error: Argument 2 must have names.
Run `rlang::last_error()` to see where the error occurred.
If it is a named vector, then it should work
> vec1 <- c(A = 4, B = 4, C = 4, D = 4)
> bind_rows(df1, vec1)
A B C D
x1 1 1 1 NA
x2 2 2 2 NA
x3 3 3 3 NA
...4 4 4 4 4
data
df1 <- structure(list(A = 1:3, B = 1:3, C = 1:3),
class = "data.frame", row.names = c("x1",
"x2", "x3"))
You probably have something like this, if you list the elements of your for loop.
(l <- list(x1, x2, x3, x4, x5))
# [[1]]
# [1] 1 1 1
#
# [[2]]
# [1] 2 2 2 2
#
# [[3]]
# [1] 3 3
#
# [[4]]
# [1] 4
#
# [[5]]
# NULL
Multiple elements can be rbinded using a do.call(rbind, .) approach, your problem is, how to rbind multiple elements that differ in length.
There's a `length<-` function with which you may adjust the length of a vector. To know to which length, there's another function, lengths, that gives you the lengths of each list element, where you are interested in the maximum.
I include the special case where an element is NULL (our 5th element of l); since the length of NULL cannot be changed, those elements are replaced with NA first.
So altogether you may do:
do.call(rbind, lapply(replace(l, lengths(l) == 0L, NA), `length<-`, max(lengths(l))))
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 NA
# [2,] 2 2 2 2
# [3,] 3 3 NA NA
# [4,] 4 NA NA NA
# [5,] NA NA NA NA
Or, since you probably want a data frame with pretty row and column names:
# Pad every element of `l` with NA up to the longest length, stack the padded
# vectors as rows, then label rows x1, x2, ... and columns A, B, ...
lens <- lengths(l)
ml <- max(lens)
# NULL elements cannot be lengthened, so they are swapped for NA first;
# seq_along()/seq_len() avoid the 1:0 pitfall of 1:length(l) and 1:ml
do.call(rbind, lapply(replace(l, lens == 0L, NA), `length<-`, ml)) |>
  as.data.frame() |>
  `dimnames<-`(list(paste0("x", seq_along(l)), LETTERS[seq_len(ml)]))
# A B C D
# x1 1 1 1 NA
# x2 2 2 2 2
# x3 3 3 NA NA
# x4 4 NA NA NA
# x5 NA NA NA NA
Note: R >= 4.1 used.
Data:
x1 <- rep(1, 3); x2 <- rep(2, 4); x3 <- rep(3, 2); x4 <- rep(4, 1); x5 <- NULL

How to merge elements of atomic vector in R?

I want to merge the elements of several named atomic vectors that are stored in a list, matching them by element name. See this example:
ls = list(a = c(a = 1, b = 2, d = 2), b = c(b = 2, c = 3), c = c(a = 1, b = 2))
Now, I wanted to get output like this:
a b c
a 1 NA 1
b 2 2 2
c NA 3 NA
d 2 NA NA
I tried Reduce, but it is not working. I do not want to use any external package for this problem.
Thanks
You can use [ in sapply after you have extracted all elements names.
i <- sort(unique(unlist(lapply(ls, names))))
x <- sapply(ls, "[", i)
rownames(x) <- i
x
# a b c
#a 1 NA 1
#b 2 2 2
#c NA 3 NA
#d 2 NA NA
We could also use bind_rows here
library(dplyr)
library(tibble)
bind_rows(ls, .id = 'x') %>%
column_to_rownames('x') %>%
t
a b c
a 1 NA 1
b 2 2 2
d 2 NA NA
c NA 3 NA
Or using base R
xtabs(values ~ ind + x, do.call(rbind, Map(cbind, x = names(ls), lapply(ls, stack))))
x
ind a b c
a 1 0 1
b 2 2 2
d 2 0 0
c 0 3 0
A data.table option using rbindlist
# `ls` is the list of named vectors from the question; each vector becomes a
# one-row data.table, rbindlist(fill = TRUE) aligns them by name, t() flips it
> t(rbindlist(Map(function(x) data.table(t(x)), ls), fill = TRUE))
[,1] [,2] [,3]
a 1 NA 1
b 2 2 2
d 2 NA NA
c NA 3 NA

Similar but different to: Split unique values into separate columns for multiple columns

My question is if I have the next data frame in R.
a<-data.frame(col1=c("a","a","a","d","a"),
col2=c("b","b","c","e","e"),
col3=c("c","d","e",NA,NA),
col4=c("d","e",NA,NA,NA),
col5=c("e",NA,NA,NA,NA))
print(a)
col1| col2| col3| col4| col5|
a b c d e
a b d e NA
a c e NA NA
d e NA NA NA
a e NA NA NA
I need other data frame like this:
b<-data.frame(col1=c("a","a","a",NA,"a"),
col2=c("b","b",NA,NA,NA),
col3=c("c",NA,"c",NA,NA),
col4=c("d","d",NA,"d",NA),
col5=c("e","e","e","e","e"))
print(b)
col1| col2| col3| col4| col5|
a b c d e
a b NA d e
a NA c NA e
NA NA NA d e
a NA NA NA e
Sorry, I don't know the right terms to describe my problem, which is why I am asking. What I think I want is: first, to give every distinct value its own column, and second, to place each row's values in the column that belongs to that value (leaving NA where the row does not contain it).
I think that my problem is similar to this: Split unique values into separate columns for multiple columns
If someone can help me I will be very thankful.
Using some tidyverse libraries you can do
library(dplyr)
library(tidyr)
a %>%
mutate(id=row_number()) %>%
pivot_longer(-id) %>%
filter(!is.na(value)) %>%
pivot_wider(id_cols=id, names_from="value", values_from="value") %>%
select(-id)
We use the pivot functions to reshape and transform the data. The trick is just to add the id column to make it easier to track the data on a per-row basis. This returns
a b c d e
<chr> <chr> <chr> <chr> <chr>
1 a b c d e
2 a b NA d e
3 a NA c NA e
4 NA NA NA d e
5 a NA NA NA e
Another base R option:
setNames(data.frame(sapply(sort(na.omit(unique(unlist(a)))),
function(x) ifelse(rowSums(a==x, na.rm=TRUE) > 0, x, NA))), colnames(a))
#> col1 col2 col3 col4 col5
#> 1 a b c d e
#> 2 a b <NA> d e
#> 3 a <NA> c <NA> e
#> 4 <NA> <NA> <NA> d e
#> 5 a <NA> <NA> <NA> e
We can do this in base R
# Place each value of a row into the slot given by its position in `letters`
# ("a" -> 1, "b" -> 2, ...); slots that receive no value stay "".
t(apply(a, 1, function(x) {
  v1 <- character(length(x))
  pos <- match(x, letters)
  # drop NA / non-letter entries from BOTH the index and the values so the
  # two sides of the assignment have equal length (no recycling warning and
  # no misplaced values when an NA is not at the end of the row)
  ok <- !is.na(pos)
  v1[pos[ok]] <- x[ok]
  v1
}))
# [,1] [,2] [,3] [,4] [,5]
#[1,] "a" "b" "c" "d" "e"
#[2,] "a" "b" "" "d" "e"
#[3,] "a" "" "c" "" "e"
#[4,] "" "" "" "d" "e"
#[5,] "a" "" "" "" "e"
Or another option is
# Start from a copy of `a`; its cells are overwritten below via b[] <- ...
b <- a
# m1: logical matrix with one row per row of `a` and one column per letter
# a-e; TRUE where table() counted that letter at least once in the row
m1 <- t(apply(a, 1, function(x) {table(factor(x, levels = letters[1:5]))})) > 0
# NA^!m1 is 1 where m1 is TRUE (NA^0 == 1) and NA where m1 is FALSE, so the
# product is the column index for letters that are present and NA otherwise;
# indexing colnames(m1) with it gives the letter or NA for every cell
b[] <- colnames(m1)[col(m1)* NA^!m1]
b
# col1 col2 col3 col4 col5
#1 a b c d e
#2 a b <NA> d e
#3 a <NA> c <NA> e
#4 <NA> <NA> <NA> d e
#5 a <NA> <NA> <NA> e
Or a slight variation of the above
t(apply(a, 1, function(x) {
tbl1 <- table(factor(x, levels = letters[1:5]))
ifelse(tbl1 >0, names(tbl1), NA)}))

Conditional mutate and vector

I have the following data frame:
df <- data.frame(
x = rep(letters[1:3], 2)
)
And the following vector:
vec <- c(1.5, 3.2)
The values of this vector belong, in order, to the rows of df where x is b. How do I add a column that holds the values of vec in those rows and NA everywhere else?
Expected outcome:
1 a NA
2 b 1.5
3 c NA
4 a NA
5 b 3.2
6 c NA
Simplest way would be to get indexes of "b" and replace them with vec.
df$output[df$x == "b"] <- vec
df
# x output
#1 a NA
#2 b 1.5
#3 c NA
#4 a NA
#5 b 3.2
#6 c NA
Another option is with replace
df$output <- replace(df$output, df$x == "b", vec)
Forcefully, fitting this into tidyverse
library(dplyr)
df$output <- NA
df %>%
mutate(output = replace(output, x == "b", vec))

Match a list of items with rows items of a data.frame

Hi guys I have a difficult situation to manage:
I have a data.frame that looks like this:
General_name
a
b
c
d
m
n
and another data.frame that looks like this:
First_names_list a=34;b=4
Second_names_list d=2;m=98;n=32
Third_names_list c=1;d=12;m=0.1
I have to match each element of the first data.frame with each element before = in the second data.frame[,2] so that finally I have to obtain the following table:
Names a b c d m n
First_names_list 34 4 NA NA NA NA
Second_names_list NA NA NA 2 98 32
Third_names_list NA NA 1 12 0.1 NA
Any suggestion? It seems to be too difficult to me.
Best
E.
Option 1
Here is one approach using dcast from "reshape2" and concat.split from my "splitstackshape" package:
library(splitstackshape)
## The following can also be done in 2 steps. The basic idea is to split
## the values into a semi-long form for `dcast` to be able to use. So,
## I've split first on the semicolon, and made the data into a long form
## at the same time, then I've split on =, but kept it wide that time.
out <- concat.split(concat.split.multiple(df, "V2", ";", "long"),
"V2", "=", drop = TRUE)
out
# V1 time V2_1 V2_2
# 1 First_names_list 1 a 34.0
# 2 Second_names_list 1 d 2.0
# 3 Third_names_list 1 c 1.0
# 4 First_names_list 2 b 4.0
# 5 Second_names_list 2 m 98.0
# 6 Third_names_list 2 d 12.0
# 7 First_names_list 3 <NA> NA
# 8 Second_names_list 3 n 32.0
# 9 Third_names_list 3 m 0.1
library(reshape2)
dcast(out[complete.cases(out), ], V1 ~ V2_1, value.var="V2_2")
# V1 a b c d m n
# 1 First_names_list 34 4 NA NA NA NA
# 2 Second_names_list NA NA NA 2 98.0 32
# 3 Third_names_list NA NA 1 12 0.1 NA
Option 2
Here's another option using a more recent version of data.table. The concept is very similar to the approach taken above.
library(data.table)
library(reshape2)
packageVersion("data.table")
# [1] ‘1.8.11’
dt <- data.table(df)
S1 <- dt[, list(X = unlist(strsplit(as.character(V2), ";"))), by = V1]
S1[, c("A", "B") := do.call(rbind.data.frame, strsplit(X, "="))]
S1
# V1 X A B
# 1: First_names_list a=34 a 34
# 2: First_names_list b=4 b 4
# 3: Second_names_list d=2 d 2
# 4: Second_names_list m=98 m 98
# 5: Second_names_list n=32 n 32
# 6: Third_names_list c=1 c 1
# 7: Third_names_list d=12 d 12
# 8: Third_names_list m=0.1 m 0.1
dcast.data.table(S1, V1 ~ A, value.var="B")
# V1 a b c d m n
# 1: First_names_list 34 4 NA NA NA NA
# 2: Second_names_list NA NA NA 2 98 32
# 3: Third_names_list NA NA 1 12 0.1 NA
Both of the above options assume we're starting with:
df <- structure(list(V1 = c("First_names_list", "Second_names_list",
"Third_names_list"), V2 = c("a=34;b=4", "d=2;m=98;n=32",
"c=1;d=12;m=0.1")), .Names = c("V1", "V2"), class = "data.frame",
row.names = c(NA, -3L))
Here is a solution, using apply within apply:
# Data frame 1: the names that become the columns of the result
df1 <- read.table(text =
"General_name
a
b
c
d
m
n", header = TRUE, as.is = TRUE)

# Data frame 2: one row per list; col2 holds "name=value" pairs joined by ";"
df2 <- read.table(text =
"col1 col2
First_names_list a=34;b=4
Second_names_list d=2;m=98;n=32
Third_names_list c=1;d=12;m=0.1", header = TRUE, as.is = TRUE)

# make lists for each row, sep by ";"
df2split <- strsplit(df2$col2, split = ";", fixed = TRUE)

# Numeric value stored under `name` in `entries`, or NA when it is absent.
#   name    : a single name, e.g. "a"
#   entries : character vector of "name=value" strings
# The name is matched literally as the full prefix "name=" (not as a regular
# expression anywhere in the string), so "a" does not match "ma=3".
# If several entries match, the first one is used.
lookup_value <- function(name, entries) {
  prefix <- paste0(name, "=")
  hit <- entries[startsWith(entries, prefix)]
  if (length(hit) == 0L) {
    return(NA_real_)
  }
  as.numeric(substring(hit[1L], nchar(prefix) + 1L))
}

# result: one row per row of df2, one column per name in df1
t(
  vapply(
    df2split,
    function(entries) {
      vapply(df1$General_name, lookup_value, numeric(1), entries = entries)
    },
    numeric(nrow(df1))
  )
)
I feel this is a slightly round-about way to do it so I look forward to a better solution as well. But this works.
library(data.table)
library(reshape2)
#creating datasets
dt <- data.table(read.csv(textConnection('
"First_names_list","a=34;b=4"
"Second_names_list","d=2;m=98;n=32"
"Third_names_list","c=1;d=12;m=0.1"
'),header = FALSE))
General_name = c('a','b','c','d','m','n')
TotalBreakup <- data.table(
V1 = General_name
)
# Fixing datatypes
TotalBreakup <- TotalBreakup[,lapply(.SD,as.character)]
dt <- dt[,lapply(.SD,as.character)]
# looping through each row and calculating breakdown
for (i in seq_len(nrow(dt))) {
  # split e.g. "a=34;b=4" into "a=34", "b=4", then each of those into its
  # name and value; the result is a flat vector name, value, name, value, ...
  pairs <- unlist(strsplit(dt[i, V2], ";"))
  name_value <- unlist(strsplit(pairs, "="))
  # names and values alternate, so a 2-row matrix holds the names in row 1 and
  # the values in row 2; transposing gives one (name, value) row per pair
  breakup <- data.table(t(matrix(name_value, nrow = 2)))
  # fixing datatypes again
  breakup <- breakup[, lapply(.SD, as.character)]
  # appending to master dataset: keep every name in TotalBreakup (all.x) and
  # add this row's values as a new column, matched on the name column V1
  TotalBreakup <- merge(TotalBreakup, breakup, by = "V1", all.x = TRUE)
}
#formatting results
setnames(TotalBreakup,c("Names",dt[,V1]))
TotalBreakup <- acast(melt(TotalBreakup,id.vars = "Names"),variable~Names)
Output -
> TotalBreakup
a b c d m n
First_names_list "34" "4" NA NA NA NA
Second_names_list NA NA NA "2" "98" "32"
Third_names_list NA NA "1" "12" "0.1" NA
A way is this:
# the second data frame you provided
DF2 <- read.table(text = '
First_names_list a=34;b=4
Second_names_list d=2;m=98;n=32
Third_names_list c=1;d=12;m=0.1
', header = FALSE, stringsAsFactors = FALSE)
# empty data frame: one row per list, one all-NA column per name
DF <- structure(list(a = c(NA, NA, NA), b = c(NA, NA, NA), c = c(NA,
NA, NA), d = c(NA, NA, NA), m = c(NA, NA, NA), n = c(NA, NA,
NA)), .Names = c("a", "b", "c", "d", "m", "n"), row.names = c("First_names_list",
"Second_names_list", "Third_names_list"), class = "data.frame")
DF
# a b c d m n
#First_names_list NA NA NA NA NA NA
#Second_names_list NA NA NA NA NA NA
#Third_names_list NA NA NA NA NA NA
# fill the data frame: row i of DF2 fills row i of DF
myls <- strsplit(DF2$V2, split = ";")
for (i in seq_along(myls)) {
  # each entry looks like "name=value": pair[1] is the column, pair[2] the
  # value, which is stored as a character string
  for (pair in strsplit(myls[[i]], "=")) {
    DF[i, pair[1]] <- pair[2]
  }
}
DF
# a b c d m n
#First_names_list 34 4 <NA> <NA> <NA> <NA>
#Second_names_list <NA> <NA> <NA> 2 98 32
#Third_names_list <NA> <NA> 1 12 0.1 <NA>

Resources