Columns as named list - r

After importing an RData object, I have a data frame in which the columns are stored as 'named list'. How do I unlist them?
structure(list(x1 = list(V1 = "1.", V2 = "2.", V3 = "3.", V4 = "4.",
V5 = "5."), company_name = list(V1 = "A", V2 = "B",
V3 = "C",
V4 = "D", V5 = "E"),
registered_office_address_commune = list(V1 = "Padova", V2 = "Padova",
V3 = "MISSING DATA", V4 = "MISSING DATA", V5 = "Padova")), row.names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame")
glimpse(df) gives
Rows: 5
Columns: 3
$ x1 <named list> ["1.", "2.", "3.", "4.", "5."]
$ company_name <named list> ["A", "B", "C", "D", "E"]
$ registered_office_address_commune <named list> ["Padova", "Padova", "MISSING DATA", "MISSING DATA", "Padova"]
I would like to have a 'normal' dataframe, with columns saved either as numerical or character,
Can anyone help?

dat <- structure(
list(
x1 = list(
V1 = "1.", V2 = "2.", V3 = "3.", V4 = "4.", V5 = "5."
),
company_name = list(
V1 = "A", V2 = "B", V3 = "C", V4 = "D", V5 = "E"
),
registered_office_address_commune = list(
V1 = "Padova", V2 = "Padova", V3 = "MISSING DATA",
V4 = "MISSING DATA", V5 = "Padova"
)
),
row.names = c("V1", "V2", "V3", "V4", "V5"),
class = "data.frame"
)
How do I unlist them?
You could iteratively call unlist() on columns. Using base R, you can do this as
# unlist() flattens each named-list column into an atomic vector
lst <- lapply(df, unlist)
which gives you a list of atomic vectors, which then can be coerced to data.frame as
res <- as.data.frame(lst)
str(res)
#> 'data.frame': 5 obs. of 3 variables:
#> $ x1 : chr "1." "2." "3." "4." ...
#> $ company_name : chr "A" "B" "C" "D" ...
#> $ registered_office_address_commune: chr "Padova" "Padova" "MISSING DATA" "MISSING DATA" ...

We can use unnest
library(dplyr)
library(tidyr)
# Name the columns to unnest explicitly: calling unnest() without `cols`
# is deprecated in tidyr >= 1.0 and emits a warning.
df %>%
  unnest(cols = everything())
# A tibble: 5 x 3
# x1 company_name registered_office_address_commune
# <chr> <chr> <chr>
#1 1. A Padova
#2 2. B Padova
#3 3. C MISSING DATA
#4 4. D MISSING DATA
#5 5. E Padova

Related

Divide or split dataframe into multiple dfs based on empty row and header title

I have a data frame that holds several tables stacked in a single file, and I want to split it into separate data frames (around 25 of them). The pattern is that a blank row followed by a header title marks the start of a new data frame. I have tried the approach from "Splitting dataframes in R based on empty rows", but it does not take care of a blank row within a new data frame (V1 column, 9th row). I want the data to be split on an empty row followed by a header title. My data and the code I have tried are given below. Also, how can I use the header row as the name of each newly created data frame?
# Example input: several small tables stacked in one data frame; the rows
# between them are empty strings in every column.
df <- structure(list(V1 = c("Machine", "", "Machine", "V1", "03-09-2020",
"", "Machine", "No", "Name", "a", "1", "2", "", "Machine", "No",
""), V2 = c("Data", "", "run", "V2", "600119", "", "error", "SpNo",
"", "a", "b", "c", "", "logs", "sp", ""), V3 = c("Editor", "",
"information", "V3", "6", "", "messages", "OP", "", "", "b",
"c", "", "", "op", ""), V4 = c("", "", "", "V4", "", "", "",
"OP", "", "", "", "", "", "", "name", "")), class = "data.frame", row.names = c(NA,
-16L))
dt <- df
## add column to indicate groups: the counter goes up by one at every row
## whose V1 is empty, so all rows between two such rows share an id
dt$tbl_id <- cumsum(!nzchar(dt$V1))
unique(dt$tbl_id)
## remove blank lines
dt <- dt[nzchar(dt$V1), ]
## split the data frame (the last column, tbl_id, is dropped from the pieces)
dt_s <- split(dt[, -ncol(dt)], dt$tbl_id)
## use first line as header and reset row numbers
dt_s <- lapply(dt_s, function(x) {
  colnames(x) <- x[1, ]
  x <- x[-1, ]
  rownames(x) <- NULL
  x
})
any help will be highly useful . Also all the header title will be same in all the files. I am using lapply for the multiple file operations.
Expected output will be :-
# Expected result for the second table; header = TRUE makes the first
# non-blank line of the text the column names.
Machine_run_nformation <- read.table(text = "
V1 V2 V3 V4
03-09-2020 600119 - 6
", header = TRUE)
# Expected result for the third table, built the same way.
Machine_error_essages <- read.table(text = "
No SpNo OP OP_Name
- - a a
1 - b b
2 - c c
", header = TRUE)
Similar to these - there will be 25 outputs
Maybe you can try
# u is TRUE for rows in which every column equals ""
u <- rowSums(df == "") == ncol(df)
# Drop those rows and split the rest; the running count of blank rows seen
# so far serves as the group key
out <- split(
  subset(df, !u),
  cumsum(u)[!u]
)
which gives
> out
$`0`
V1 V2 V3 V4
1 Machine Data Editor
$`1`
V1 V2 V3 V4
3 Machine run information
4 V1 V2 V3 V4
5 03-09-2020 600119 6
$`2`
V1 V2 V3 V4
7 Machine error messages
8 No SpNo OP OP
9 Name
10 a a
11 1 b b
12 2 c c
$`3`
V1 V2 V3 V4
14 Machine logs
15 No sp op name
here is an approach using dplyr::group_split (which is in an experimental lifecycle).
library(dplyr) # attaches %>%, which the pipeline below relies on

df <- structure(list(V1 = c("Machine", "", "Machine", "V1", "03-09-2020",
"", "Machine", "No", "Name", "a", "1", "2", "", "Machine", "No",
""), V2 = c("Data", "", "run", "V2", "600119", "", "error", "SpNo",
"", "a", "b", "c", "", "logs", "sp", ""), V3 = c("Editor", "",
"information", "V3", "6", "", "messages", "OP", "", "", "b",
"c", "", "", "op", ""), V4 = c("", "", "", "V4", "", "", "",
"OP", "", "", "", "", "", "", "name", "")), class = "data.frame", row.names = c(NA,
-16L))
df %>%
  # FLAG is TRUE for rows in which every column equals ""
  dplyr::mutate(FLAG = rowSums(. == "") == ncol(.)) %>%
  # GRP counts the blank rows seen so far, giving one id per table
  dplyr::mutate(GRP = cumsum(FLAG)) %>%
  dplyr::filter(!FLAG) %>%
  dplyr::group_by(GRP) %>%
  dplyr::group_split() %>%
  # drop the helper columns from every piece
  lapply(function(f) dplyr::select(f, -FLAG, -GRP))
[[1]]
# A tibble: 1 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <chr>
1 Machine Data Editor ""
[[2]]
# A tibble: 3 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <chr>
1 Machine run information ""
2 V1 V2 V3 "V4"
3 03-09-2020 600119 6 ""
[[3]]
# A tibble: 6 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <chr>
1 Machine "error" "messages" ""
2 No "SpNo" "OP" "OP"
3 Name "" "" ""
4 a "a" "" ""
5 1 "b" "b" ""
6 2 "c" "c" ""
[[4]]
# A tibble: 2 x 4
V1 V2 V3 V4
<chr> <chr> <chr> <chr>
1 Machine logs "" ""
2 No sp "op" "name"

Creating a column in a dataframe with values from another dataframe in R

I have a dataframe with the following structure:
V1 V2 V3 V4 V5 V6 V7 V8
A B A B B A B B
It's only one row and each column has one letter. I wanna take these letters, two by two and insert them in another dataframe, as follows:
V1 V2 V3
X AB F
Y AB G
Z BA H
W BB I
How should I proceed? Maybe turn the first dataframe into a vector?
Thanks in advance!
I have no clue where V1 and V3 come from, but the code below can help you produce V2
# Split the columns into consecutive pairs (1-2, 3-4, ...), paste each pair
# together, and stack the results into a one-column matrix
do.call(
  rbind,
  lapply(
    split.default(df1, ceiling(seq_along(df1) / 2)),
    function(x) do.call(paste0, x)
  )
)
which gives
[,1]
1 "AB"
2 "AB"
3 "BA"
4 "BB"
Data
> dput(df1)
structure(list(V1 = "A", V2 = "B", V3 = "A", V4 = "B", V5 = "B",
V6 = "A", V7 = "B", V8 = "B"), class = "data.frame", row.names = c(NA,
-1L))
Here is a more long-winded way of doing this.
xy <- structure(list(V1 = "A", V2 = "B", V3 = "A", V4 = "B", V5 = "B",
V6 = "A", V7 = "B", V8 = "B"), class = "data.frame", row.names = c(NA,
-1L))
xy <- unlist(xy) # flatten the one-row data.frame into a named character vector
# Make sure that length of xy is divisible by 2.
stopifnot((length(xy) %% 2) == 0)
# One group id per pair of elements; seq_len() (unlike 1:n) yields an empty
# sequence when there are no pairs.
tosplit <- rep(seq_len(length(xy) / 2), each = 2)
tosplit <- factor(tosplit)
xy <- split(xy, f = tosplit)
# Collapse each pair into a single string.
xy <- lapply(xy, FUN = paste, collapse = "")
xy <- unlist(xy)
xy
1 2 3 4
"AB" "AB" "BA" "BB"
This can easily be manipulated into a data.frame of your choosing.
We can also convert to a matrix of two columns and then do the paste
do.call(paste0, as.data.frame(matrix(unlist(df1), ncol = 2, byrow = TRUE)))
#[1] "AB" "AB" "BA" "BB"
data
df1 <- structure(list(V1 = "A", V2 = "B", V3 = "A", V4 = "B", V5 = "B",
V6 = "A", V7 = "B", V8 = "B"), class = "data.frame", row.names = c(NA,
-1L))

How to reshape scanned input of pairs of data from data set with unequal numbers of columns

I have pairs of data scanned where the number of pairs on a line is variable in number.
R code:
x <- scan(paste0(dirdata,"df.txt"), what = "", sep = " ")
Lines in data set look like
v1 v2 v3 v4 v5 v6 V7 V8
"A" "35" "B" "32" "Z" "67" "F" 17
"F" "17" NA NA NA NA NA NA
I want to reshape it to look like
v1 v2
"A" "35"
"B" "32"
"Z" "67"
"F" "17"
Can this be done in base R? How? in reshape package? How?
I am doing things two ways (1) using base R code and (2) using a package such as reshape.
Output from an unlist function is given below.
d1 <- data.frame(v1 = unlist(df1[,c(TRUE, FALSE)]),
v2 = unlist(df1[,c(FALSE, TRUE)]))
data.frame(lapply(d1, na.omit))
head(d1)
v1 v2
V21 A 1
V22 B 50
V23 Z 74
V24 F 3
Can someone explain what the unlist function is doing and is column one of the output line numbers?
Can you help, please? Thanks.
MM
If it is data.frame
d1 <- data.frame(v1 = unlist(df1[,c(TRUE, FALSE)]),
v2 = unlist(df1[,c(FALSE, TRUE)]))
data.frame(lapply(d1, na.omit))
# v1 v2
#1 A 35
#2 F 17
#3 B 32
#4 Z 67
#5 F 17
or using reshape from base R
na.omit(reshape(transform(df1, rn = seq_len(nrow(df1))),
direction = 'long', idvar = 'rn',
varying = list(seq(1, ncol(df1), by = 2), seq(2, ncol(df1), by = 2)))[3:4])
data
df1 <- structure(list(v1 = c("A", "F"), v2 = c(35L, 17L), v3 = c("B",
NA), v4 = c(32L, NA), v5 = c("Z", NA), v6 = c(67L, NA), V7 = c("F",
NA), V8 = c(17L, NA)), row.names = c(NA, -2L), class = "data.frame")

Collapse columns into a new variable

I have a data frame that looks like this.
name = c("p1","p2","p3","p4")
place = c("f","g","h","i")
v1 = c("x", "NA", "NA", "NA")
v2 = c("NA", "y", "y", "NA")
v3 = c("NA", "NA", "z", "NA")
region = c("n","w","s","e")
grade = c("f1","f2","f3","f4")
df = data.frame(name, place, v1, v2, v3, region, grade)
name place v1 v2 v3 region grade
1 p1 f x NA NA n f1
2 p2 g NA y NA w f2
3 p3 h NA y z s f3
4 p4 i NA NA NA e f4
I would like to add a new character vector v4 that contains the character from any of columns v1 v2 v3.
name place v1 v2 v3 v4 region grade
1 p1 f x NA NA x n f1
2 p2 g NA y NA y w f2
3 p3 h NA y z yz s f3
4 p4 i NA NA NA NA e f4
many thanks
We can use paste after converting the columns to character
# Blank out missing entries (real NA or the literal string "NA") column by
# column before pasting, so that "NA" occurring inside a genuine value is
# left untouched.
parts <- lapply(df[3:5], function(col) {
  col <- as.character(col)
  col[is.na(col) | col == "NA"] <- ""
  col
})
df$V4 <- do.call(paste0, unname(parts))
# Rows where all three columns were missing become NA again.
df$V4[df$V4 == ""] <- NA
df$V4
#[1] "x" "y" "yz" NA
data
df <- structure(list(name = c("p1", "p2", "p3", "p4"), place = c("f",
"g", "h", "i"), v1 = c("x", NA, NA, NA), v2 = c(NA, "y", "y",
NA), v3 = c(NA, NA, "z", NA), region = c("n", "w", "s", "e"),
grade = c("f1", "f2", "f3", "f4")), .Names = c("name", "place",
"v1", "v2", "v3", "region", "grade"), class = "data.frame",
row.names = c("1", "2", "3", "4"))
The dplyr alternative:
# install.packages("dplyr")  # run once, only if dplyr is not yet installed
library(dplyr)
df <- df %>%
  # paste0() renders missing values as "NA"; gsub() then removes that text
  # (NOTE: it also removes "NA" occurring inside a real value)
  mutate(v4 = gsub("NA", "", paste0(v1, v2, v3))) %>%
  # rows with nothing left become NA
  mutate(v4 = ifelse(v4 == "", NA, v4))
This should work whether the missing entries are real NA values (not-a-value) or the string "NA" (character). And if you don't care whether v4 contains "" or NA, you can leave off the last line (and delete the last pipe).

How to delete the duplicated values of each column after group_by() using the `dplyr` package

I have a data.frame mydata like this
V1 V2 V3 V4 V5
1 a b a
2 a b c
3 a b d
4 x y h
5 x y k e
I want to group it by the columns V1 and V2, and delete the "" strings in the other columns
the result should like this
V1 V2 V3 V4 V5
1 a b a c d
2 x y h k e
Is there an efficient way to do this using the dplyr package? Thank you very much.
Using base R, if that's of interest
x <- data.frame(
  V1 = c(rep("a", 3), "x", "x"),
  V2 = c(rep("b", 3), "y", "y"),
  V3 = c("a", "", "", "h", ""),
  V4 = c("", "c", "", "", "k"),
  V5 = c(rep("", 2), "d", "", "e")
)
# For one column: drop the "" entries and keep each remaining value once
drop_blank_unique <- function(column) {
  kept <- column[column != ""]
  as.character(unique(kept))
}
temp <- lapply(x, drop_blank_unique)
# Bind the per-column results side by side
data.frame(do.call(cbind, temp))
V1 V2 V3 V4 V5
1 a b a c d
2 x y h k e
We can use dplyr/tidyr. We reshape the data from 'wide' to 'long' using gather, remove the blank elements in the 'Val' column with filter, and reshape it back to 'wide' format with spread.
library(dplyr)
library(tidyr)
gather(mydata, Var, Val, V3:V5) %>%
filter(Val!='') %>%
spread(Var, Val)
# V1 V2 V3 V4 V5
#1 a b a c d
#2 x y h k e
Or another approach using only dplyr (if the number of non-blank values are the same across each groups) would be to group by 'V1', 'V2', and use summarise_each to select only the elements that are not blank (.[.!=''])
mydata %>%
group_by(V1, V2) %>%
summarise_each(funs(.[.!='']))
# V1 V2 V3 V4 V5
#1 a b a c d
#2 x y h k e
We can also use data.table to do this. We convert the 'data.frame' to 'data.table' (setDT(mydata)), grouped by 'V1', 'V2', we loop through the other columns (lapply(.SD, ...)) and subset the elements that are not blank.
library(data.table)
setDT(mydata)[,lapply(.SD, function(x) x[x!='']) ,.(V1, V2)]
# V1 V2 V3 V4 V5
#1: a b a c d
#2: x y h k e
Similar approach using aggregate from base R is
aggregate(.~V1+V2, mydata, FUN=function(x) x[x!=''])
# V1 V2 V3 V4 V5
#1 a b a c d
#2 x y h k e
data
mydata <- structure(list(V1 = c("a", "a", "a", "x", "x"),
V2 = c("b", "b",
"b", "y", "y"), V3 = c("a", "", "", "h", ""), V4 = c("", "c",
"", "", "k"), V5 = c("", "", "d", "", "e")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5"))

Resources