Collapse columns into a new variable

Collapse columns into a new variable - r

I have a data frame that looks like this.
# Example data: v1-v3 are the columns to collapse. Note that empty cells are
# stored as the literal string "NA", not as a real missing value.
name <- c("p1", "p2", "p3", "p4")
place <- c("f", "g", "h", "i")
v1 <- c("x", "NA", "NA", "NA")
v2 <- c("NA", "y", "y", "NA")
v3 <- c("NA", "NA", "z", "NA")
region <- c("n", "w", "s", "e")
grade <- c("f1", "f2", "f3", "f4")
df <- data.frame(
  name = name,
  place = place,
  v1 = v1,
  v2 = v2,
  v3 = v3,
  region = region,
  grade = grade
)
name place v1 v2 v3 region grade
1 p1 f x NA NA n f1
2 p2 g NA y NA w f2
3 p3 h NA y z s f3
4 p4 i NA NA NA e f4
I would like to add a new character vector v4 that contains the character from any of columns v1 v2 v3.
name place v1 v2 v3 v4 region grade
1 p1 f x NA NA x n f1
2 p2 g NA y NA y w f2
3 p3 h NA y z yz s f3
4 p4 i NA NA NA NA e f4
many thanks

We can use paste after converting the columns to character
# Build V4 by concatenating columns 3:5 (v1..v3) row-wise.
# Missing cells -- a real NA or the literal string "NA" -- are blanked per
# cell *before* pasting, so a genuine value that merely contains the letters
# "NA" is left intact (a gsub("NA", "") on the pasted string would strip them).
df$V4 <- do.call(paste0, unname(lapply(df[3:5], function(col) {
  col <- as.character(col)
  col[is.na(col) | col == "NA"] <- ""
  col
})))
# Rows in which every source column was missing collapse to "": mark them NA.
df$V4[df$V4 == ""] <- NA
df$V4
#[1] "x" "y" "yz" NA
data
df <- structure(list(name = c("p1", "p2", "p3", "p4"), place = c("f",
"g", "h", "i"), v1 = c("x", NA, NA, NA), v2 = c(NA, "y", "y",
NA), v3 = c(NA, NA, "z", NA), region = c("n", "w", "s", "e"),
grade = c("f1", "f2", "f3", "f4")), .Names = c("name", "place",
"v1", "v2", "v3", "region", "grade"), class = "data.frame",
row.names = c("1", "2", "3", "4"))

The dplyr alternative:
# install.packages("dplyr")  # one-time setup; do not re-install on every run
library(dplyr)

# Turn a missing cell (a real NA or the literal string "NA") into "".
# Working per cell keeps real values that merely contain the letters "NA"
# intact, which a gsub("NA", "") on the pasted string would not.
blank_missing <- function(x) {
  x <- as.character(x)
  ifelse(is.na(x) | x == "NA", "", x)
}

df <- df %>%
  mutate(
    v4 = paste0(blank_missing(v1), blank_missing(v2), blank_missing(v3))
  ) %>%
  mutate(v4 = na_if(v4, ""))
This works whether the missing entries are real NA values (not available) or the literal string "NA". If you don't care whether v4 ends up as "" or NA for rows that have no values, you can leave off the last line (and delete the last pipe).

Related

Columns as named list

After importing a Rdata object, I have a dataframe, in which the columns are stored as 'named list'. How do I unlist them?
structure(list(x1 = list(V1 = "1.", V2 = "2.", V3 = "3.", V4 = "4.",
V5 = "5."), company_name = list(V1 = "A", V2 = "B",
V3 = "C",
V4 = "D", V5 = "E"),
registered_office_address_commune = list(V1 = "Padova", V2 = "Padova",
V3 = "MISSING DATA", V4 = "MISSING DATA", V5 = "Padova")), row.names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame")
glimpse(df) gives
Rows: 5
Columns: 3
$ x1 <named list> ["1.", "2.", "3.", "4.", "5."]
$ company_name <named list> ["A", "B", "C", "D", "E"]
$ registered_office_address_commune <named list> ["Padova", "Padova", "MISSING DATA", "MISSING DATA", "Padova"]
I would like to have a 'normal' dataframe, with columns saved either as numerical or character,
Can anyone help?

dat <- structure(
list(
x1 = list(
V1 = "1.", V2 = "2.", V3 = "3.", V4 = "4.", V5 = "5."
),
company_name = list(
V1 = "A", V2 = "B", V3 = "C", V4 = "D", V5 = "E"
),
registered_office_address_commune = list(
V1 = "Padova", V2 = "Padova", V3 = "MISSING DATA",
V4 = "MISSING DATA", V5 = "Padova"
)
),
row.names = c("V1", "V2", "V3", "V4", "V5"),
class = "data.frame"
)
How do I unlist them?
You could iteratively call unlist() on columns. Using base R, you can do this as
# Flatten every list-column of `dat` into a plain atomic vector.
lst <- lapply(dat, unlist)
which gives you a list of atomic vectors, which then can be coerced to data.frame as
res <- as.data.frame(lst)
str(res)
#> 'data.frame': 5 obs. of 3 variables:
#> $ x1 : chr "1." "2." "3." "4." ...
#> $ company_name : chr "A" "B" "C" "D" ...
#> $ registered_office_address_commune: chr "Padova" "Padova" "MISSING DATA" "MISSING DATA" ...

We can use unnest
library(dplyr)
library(tidyr)
# Name the columns to unnest explicitly: calling unnest() without `cols`
# is deprecated since tidyr 1.0.0 and emits a warning.
df %>%
  unnest(cols = everything())
# A tibble: 5 x 3
# x1 company_name registered_office_address_commune
# <chr> <chr> <chr>
#1 1. A Padova
#2 2. B Padova
#3 3. C MISSING DATA
#4 4. D MISSING DATA
#5 5. E Padova

How to reshape scanned input of pairs of data from data set with unequal numbers of columns

I have pairs of data scanned where the number of pairs on a line is variable in number.
R code:
x <- scan(paste0(dirdata,"df.txt"), what = "", sep = " ")
Lines in data set look like
v1 v2 v3 v4 v5 v6 V7 V8
"A" "35" "B" "32" "Z" "67" "F" 17
"F" "17" NA NA NA NA NA NA
I want to reshape it to look like
v1 v2
"A" "35"
"B" "32"
"Z" "67"
"F" "17"
Can this be done in base R? How? in reshape package? How?
I am doing things two ways (1) using base R code and (2) using a package such as reshape.
Output from an unlist function is given below.
d1 <- data.frame(v1 = unlist(df1[,c(TRUE, FALSE)]),
v2 = unlist(df1[,c(FALSE, TRUE)]))
data.frame(lapply(d1, na.omit))
head(d1)
v1 v2
V21 A 1
V22 B 50
V23 Z 74
V24 F 3
Can someone explain what the unlist function is doing and is column one of the output line numbers?
Can you help, please? Thanks.
MM

If it is data.frame
# Stack the odd-position columns (labels) and the even-position columns
# (values) into two long vectors; the logical index is recycled across columns.
d1 <- data.frame(v1 = unlist(df1[, c(TRUE, FALSE)]),
                 v2 = unlist(df1[, c(FALSE, TRUE)]))
# Drop incomplete pairs row-wise so every label stays next to its own value.
# Removing NAs separately per column only works when both columns happen to
# be missing in exactly the same rows. row.names = NULL renumbers the rows.
data.frame(d1[complete.cases(d1), ], row.names = NULL)
# v1 v2
#1 A 35
#2 F 17
#3 B 32
#4 Z 67
#5 F 17
or using reshape from base R
# Base-R reshape() variant: add a row id `rn`, stack the odd-position and the
# even-position columns of df1 as two long variables, keep columns 3:4 of the
# reshaped result, and drop every row that contains an NA.
# NOTE(review): columns 3:4 are presumably the stacked label/value pair that
# follows `rn` and `time` -- confirm against the reshape() output.
na.omit(reshape(transform(df1, rn = seq_len(nrow(df1))),
direction = 'long', idvar = 'rn',
varying = list(seq(1, ncol(df1), by = 2), seq(2, ncol(df1), by = 2)))[3:4])
data
df1 <- structure(list(v1 = c("A", "F"), v2 = c(35L, 17L), v3 = c("B",
NA), v4 = c(32L, NA), v5 = c("Z", NA), v6 = c(67L, NA), V7 = c("F",
NA), V8 = c(17L, NA)), row.names = c(NA, -2L), class = "data.frame")

subset of R undefined columns

I'm trying to use subset to get values from the union of two tables
> ans<-subset(table2, select=rownames(table1))
But i get the following error:
Error in [.data.frame(x, r, vars, drop = drop) : undefined columns selected
Given table1
V2
E x
F x
G x
H x
And table2
V1 V2 V3 V4 V5 V6
1 A B C D E F
2 2 5 6 4 6 8
I want to obtain:
E F
6 8

Used this data:
table1 <- structure(list(V2 = structure(c(1L, 1L, 1L, 1L), .Label = "x", class = "factor")), class = "data.frame", row.names = c("E",
"F", "G", "H"))
structure(list(X1 = c("A", "2"), X2 = c("B", "5"), X3 = c("C",
"6"), X4 = c("D", "4"), X5 = c("E", "6"), X6 = c("F", "8")), class = "data.frame", row.names = c(NA,
-2L))
Note: This does not work if the data structure is factors. I assembled table2 with:
table2 <- data.frame(rbind(as.character(LETTERS[1:6]), c(2, 5, 6, 4, 6, 8)), stringsAsFactors = FALSE)
So, then this works:
# Keep the columns of table2 whose first-row label matches a row name of
# table1. drop = FALSE keeps the result a data frame even when only one
# column matches (it would otherwise collapse to a bare vector).
ans <- table2[, as.character(table2[1, ]) %in% rownames(table1), drop = FALSE]
ans

Column names into data frame

How can I create a data frame which contains the column names of all Environment objects (df)
For example, suppose these three data frames are the only objects in the global environment.
chocolate <- data.frame(a = 1, b = 2, c = 3)
banana <- data.frame(a = 2, d = 4, c = 3)
pear <- data.frame(d = 1, e = 4)
Desired output
output <- data.frame(id = c("chocolate","banana", "pear"),
v2 = c("a", "a", NA),
v3 = c("b", NA, NA),
v4 = c("c", "c", NA),
v5 = c(NA, "d", "d"),
v6 = c(NA, NA, "e"))
output

We can try
library(data.table)
# Fetch the data frames by name into a list.
# NOTE(review): this looks up objects called df1, df2, df3 -- presumably
# stand-ins for the question's chocolate/banana/pear; adjust to the real names.
lst <- mget(paste0("df", 1:3))
# The list is renamed 1..n so the `id` column holds each data frame's position.
# Inside the function every cell is overwritten with column names
# (NOTE(review): assumes one-row data frames, so each column receives its own
# name -- confirm). rbindlist(fill = TRUE) pads columns that a data frame
# lacks with NA, setnames() renames columns 2:6 to V1..V5, and the trailing
# [] makes the result print.
setnames(rbindlist(lapply(setNames(lst, seq_along(lst)), function(x) {
x[] <- names(x)
x}), fill = TRUE, idcol = 'id'), 2:6, paste0("V", 1:5))[]
# id V1 V2 V3 V4 V5
#1: 1 a b c NA NA
#2: 2 a NA c d NA
#3: 3 NA NA NA d e

how to delete duplicated duplicated of each column after group_by() using `dplyr` package

I have a data.frame mydata like this
V1 V2 V3 V4 V5
1 a b a
2 a b c
3 a b d
4 x y h
5 x y k e
I want to group it by the columns V1 and V2, and delete the "" strings in the other columns.
The result should look like this:
V1 V2 V3 V4 V5
1 a b a c d
2 x y h k e
Is there an efficient way to do this using the dplyr package? Thank you very much.

Using base R, if that's of interest
# Sample input: V1/V2 identify the group, V3-V5 are mostly blank strings.
x <- data.frame(
  V1 = c("a", "a", "a", "x", "x"),
  V2 = c("b", "b", "b", "y", "y"),
  V3 = c("a", "", "", "h", ""),
  V4 = c("", "c", "", "", "k"),
  V5 = c("", "", "d", "", "e")
)

# For every column keep the distinct non-blank values, as character.
# No explicit grouping happens here: the columns line up only because each
# one yields the same number of distinct values, in the same order.
temp <- lapply(x, function(column) {
  non_blank <- column[column != ""]
  as.character(unique(non_blank))
})

# Bind the per-column vectors side by side and show them as a data frame.
data.frame(do.call(cbind, temp))
V1 V2 V3 V4 V5
1 a b a c d
2 x y h k e

We can use dplyr/tidyr. We reshape the data from 'wide' to 'long' using gather, remove the blank elements in the 'Val' column with filter, and reshape it back to 'wide' format with spread.
library(dplyr)
library(tidyr)
gather(mydata, Var, Val, V3:V5) %>%
filter(Val!='') %>%
spread(Var, Val)
# V1 V2 V3 V4 V5
#1 a b a c d
#2 x y h k e
Another approach using only dplyr (valid when the number of non-blank values is the same in every group) is to group by 'V1' and 'V2', and use summarise_each to keep only the elements that are not blank (.[.!=''])
# summarise_each() and funs() are deprecated in dplyr; summarise(across())
# is the current equivalent. For each V1/V2 group, keep the non-blank element
# of every remaining column (each group must yield exactly one value per
# column). .groups = "drop" returns an ungrouped result.
mydata %>%
  group_by(V1, V2) %>%
  summarise(across(everything(), ~ .x[.x != ""]), .groups = "drop")
# V1 V2 V3 V4 V5
#1 a b a c d
#2 x y h k e
We can also use data.table to do this. We convert the 'data.frame' to 'data.table' (setDT(mydata)), grouped by 'V1', 'V2', we loop through the other columns (lapply(.SD, ...)) and subset the elements that are not blank.
library(data.table)
setDT(mydata)[,lapply(.SD, function(x) x[x!='']) ,.(V1, V2)]
# V1 V2 V3 V4 V5
#1: a b a c d
#2: x y h k e
Similar approach using aggregate from base R is
aggregate(.~V1+V2, mydata, FUN=function(x) x[x!=''])
# V1 V2 V3 V4 V5
#1 a b a c d
#2 x y h k e
data
mydata <- structure(list(V1 = c("a", "a", "a", "x", "x"),
V2 = c("b", "b",
"b", "y", "y"), V3 = c("a", "", "", "h", ""), V4 = c("", "c",
"", "", "k"), V5 = c("", "", "d", "", "e")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Collapse columns into a new variable - r

Related

Columns as named list

How to reshape scanned input of pairs of data from data set with unequal numbers of columns

subset of R undefined columns

Column names into data frame

how to delete duplicated duplicated of each column after group_by() using `dplyr` package

Categories

Resources