Restructuring data from long to wide by removing characters - r

Here is a sample of my data
code group type outcome
11 A red M*P
11 N orange N*P
11 Z red R
12 AB A blue Z*P
12 AN B green Q*P
12 AA A gray AB
which can be created by:
df <- data.frame(
code = c(rep(11,3), rep(12,3)),
group = c("A", "N", "Z", "AB A", "AN B", "AA A"),
type = c("red", "orange", "red", "blue", "green", "gray"),
outcome = c("M*P", "N*P", "R", "Z*P", "Q*P", "AB"),
stringsAsFactors = FALSE
)
I want to get the following table
code group1 group2 group3 type1 type2 type3 outcome
11 A N Z red orange red MNR
12 AB A AN B AA A blue green gray ZQAB
I have used the following code, but it does not work. I also want to remove the "*P" suffix from each outcome value before the outcomes are concatenated. Thanks for your help.
dcast(df, formula= code +group ~ type, value.var = 'outcome')

Using data.table to hit your expected output:
library(data.table)
setDT(df)
# Clean out the Ps beforehand
df[, outcome := gsub("*P", "", outcome, fixed = TRUE)]
# dcast, but let's leave the outcome for later... (easier)
wdf <- dcast(df, code ~ rowid(code), value.var = c('group', 'type'))
# Now outcome maneuvering separately by code and merge
merge(wdf, df[, .(outcome = paste(outcome, collapse = "")), code])
code group_1 group_2 group_3 type_1 type_2 type_3 outcome
1: 11 A N Z red orange red MNR
2: 12 AB A AN B AA A blue green gray ZQAB

Related

How to filter values in a list within a dataframe in R?

I have a dataframe, df:
df <- structure(list(id = c("id1", "id2", "id3",
"id4"), type = c("blue", "blue", "brown", "blue"
), value = list(
value1 = "cat", value2 = character(0),
value3 = "dog", value4 = "fish")), row.names = 1:4, class = "data.frame")
> df
id type value
1 id1 blue cat
2 id2 blue
3 id3 brown dog
4 id4 blue fish
The third column, value, is a list. I want to filter out any rows of the data frame whose entry in that column doesn't contain any characters (i.e. the second row).
I've tried this:
df <- filter(df, value != "")
and this
df <- filter(df, nchar(value) != 0)
But it doesn't have any effect on the data frame. What is the correct way to do this so my data frame looks like this:
> df
id type value
1 id1 blue cat
3 id3 brown dog
4 id4 blue fish
The lengths() function is perfect here - it gives the length of each element of a list. You want all the rows where value has non-zero length:
df[lengths(df$value) > 0, ]
# id type value
# 1 id1 blue cat
# 3 id3 brown dog
# 4 id4 blue fish
here is my approach
idx <- lapply(df$value, length)
filter(df, idx > 0)
id type value
1 id1 blue cat
2 id3 brown dog
3 id4 blue fish
An option with tidyverse
library(dplyr)
library(purrr)
df %>%
filter(map_int(value, length) > 0)
# id type value
#1 id1 blue cat
#2 id3 brown dog
#3 id4 blue fish
Try this:
df <- filter(df, !sapply(df$value,function(x) identical(x,character(0))) )

Stacking multiple columns to two columns and removing duplicates in R

I have multiple columns, but here is only a part of my data:
df<-read.table (text=" Color1 Size1 Color2 Size2 Color3 Size3
Yellow AA Gray GB Purpul MO
Blue BD Cyne CE Gray GB
Yellow AA Yellow AA Black LL
Red MD Reddark KK Reddark KK
Green MC Reddark KK Green MC
", header=TRUE)
I want to bring down all the columns and show them as two columns and then remove duplicates to get this table:
Color Size
Yellow AA
Blue BD
Red MD
Green MC
Gray GB
Cyne CE
Reddark KK
Purpul MO
Black LL
I tried using melt from reshape2, but I struggled to get it to work.
With no other libraries, reshape and unique can get the job done:
> unique(reshape(df, varying=1:6, direction="long", v.names=c("Color", "Size"), timevar=NULL)[1:2])
Color Size
1.1 Yellow AA
2.1 Blue BD
4.1 Red MD
5.1 Green MC
1.2 Gray GB
2.2 Cyne CE
4.2 Reddark KK
1.3 Purpul MO
3.3 Black LL
Pivoting seems like overkill to me, but what do I know. If the index bothers you (though it saves the information on how the wide table was structured) then reset the row names:
> uniq = unique(reshape(df, varying=1:6, direction="long", v.names=c("Color", "Size"), timevar=NULL)[1:2])
> rownames(uniq) = NULL
Another way of using pivot_longer() and pivot_wider() can be:
library(dplyr)
library(tidyr)
#Code
newdf <- df %>%
pivot_longer(everything()) %>%
mutate(name=substr(name,1,nchar(name)-1)) %>%
group_by(name) %>% mutate(id2=row_number()) %>%
pivot_wider(names_from = name,values_from=value) %>%
select(-id2) %>%
filter(!duplicated(paste(Color,Size)))
Output:
# A tibble: 9 x 2
Color Size
<fct> <fct>
1 Yellow AA
2 Gray GB
3 Purpul MO
4 Blue BD
5 Cyne CE
6 Black LL
7 Red MD
8 Reddark KK
9 Green MC
We can use pivot_longer from tidyr to reshape from 'wide' to 'long' in two columns by specifying the names_sep as the boundary between a letter and a digit ((?<=[a-z])(?=\\d)) in the column names and then take the distinct of the two columns
library(dplyr)
library(tidyr)
pivot_longer(df, cols = everything(),
names_to = c( '.value', 'grp'), names_sep="(?<=[a-z])(?=\\d)") %>%
distinct(Color, Size)
-output
# A tibble: 9 x 2
# Color Size
# <chr> <chr>
#1 Yellow AA
#2 Gray GB
#3 Purpul MO
#4 Blue BD
#5 Cyne CE
#6 Black LL
#7 Red MD
#8 Reddark KK
#9 Green MC
Or using data.table
library(data.table)
unique(melt(setDT(df), measure = patterns('^Color', '^Size'),
value.name = c('Color', 'Size'))[, variable := NULL])
# Color Size
#1: Yellow AA
#2: Blue BD
#3: Red MD
#4: Green MC
#5: Gray GB
#6: Cyne CE
#7: Reddark KK
#8: Purpul MO
#9: Black LL
data
df <- structure(list(Color1 = c("Yellow", "Blue", "Yellow", "Red",
"Green"), Size1 = c("AA", "BD", "AA", "MD", "MC"), Color2 = c("Gray",
"Cyne", "Yellow", "Reddark", "Reddark"), Size2 = c("GB", "CE",
"AA", "KK", "KK"), Color3 = c("Purpul", "Gray", "Black", "Reddark",
"Green"), Size3 = c("MO", "GB", "LL", "KK", "MC")),
class = "data.frame", row.names = c(NA,
-5L))

R append 2 data frames with different columns

I would like to append dfToAdd to df, where the first has missing columns. Important detail is that df has 2 types of columns. 1st set of columns are correlating with each other.
e.g. group="A" means name="Group A" and color="Blue". There can't be a combination of A-Group A-Red.
2nd type of columns are correlating among themselves.
animal="Dog" action="Bark"
I would like to append this second data frame, which lacks the first type of columns. The missing columns should be filled with the existing combinations of the first type of columns, as in the following dfResult (the order of rows doesn't matter):
df = data.frame(group = c("A", "A", "A", "B", "B", "B"),
name = c("Group A", "Group A", "Group A", "Group B", "Group B", "Group B"),
color = c("Blue", "Blue", "Blue", "Red", "Red", "Red"),
animal = c("Dog", "Cat", "Mouse", "Dog", "Cat", "Mouse"),
action = c("Bark", "Meow", "Squeak", "Bark", "Meow", "Squeak")
)
dfToAdd = data.frame(animal = c("Lion", "Bird"),
action = c("Roar", "Chirp"))
dfResult = data.frame(group = c("A", "A", "A", "B", "B", "B", "A", "A", "B", "B"),
name = c("Group A", "Group A", "Group A", "Group B", "Group B", "Group B", "Group A", "Group A", "Group B", "Group B"),
color = c("Blue", "Blue", "Blue", "Red", "Red", "Red", "Blue", "Blue", "Red", "Red"),
animal = c("Dog", "Cat", "Mouse", "Dog", "Cat", "Mouse", "Lion", "Bird", "Lion", "Bird"),
action = c("Bark", "Meow", "Squeak", "Bark", "Meow", "Squeak", "Roar", "Chirp", "Roar", "Chirp"))
> df
group name color animal action
1 A Group A Blue Dog Bark
2 A Group A Blue Cat Meow
3 A Group A Blue Mouse Squeak
4 B Group B Red Dog Bark
5 B Group B Red Cat Meow
6 B Group B Red Mouse Squeak
> dfToAdd
animal action
1 Lion Roar
2 Bird Chirp
> dfResult
group name color animal action
1 A Group A Blue Dog Bark
2 A Group A Blue Cat Meow
3 A Group A Blue Mouse Squeak
4 B Group B Red Dog Bark
5 B Group B Red Cat Meow
6 B Group B Red Mouse Squeak
7 A Group A Blue Lion Roar
8 A Group A Blue Bird Chirp
9 B Group B Red Lion Roar
10 B Group B Red Bird Chirp
But the 1st type of columns (group, name, color) is not completely known. I am working with an arbitrary number of grouping variables. You can imagine that there may or may not be a description column="Group A is a good group" or date="2020.04.13". We only know for sure the columns of the second type: animal and action.
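We could do this in a single %>% chain by slicing the first row from 'df', selecting the columns that are not in 'dfToAdd', replicating that row once per row of 'dfToAdd' with uncount, binding it column-wise with 'dfToAdd', then row-binding the result with 'df' and using complete to fill in the missing combinations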
library(dplyr)
library(tidyr)
library(rlang)
library(purrr)
df %>%
slice(1) %>%
select(-names(dfToAdd)) %>%
uncount(nrow(dfToAdd)) %>%
bind_cols(dfToAdd) %>%
bind_rows(df, .) %>%
complete(nesting(!!! syms(names(dfToAdd))),
nesting(!!! syms(setdiff(names(.), names(dfToAdd)))))
# A tibble: 10 x 5
# animal action group name color
# * <fct> <fct> <fct> <fct> <fct>
# 1 Cat Meow A Group A Blue
# 2 Cat Meow B Group B Red
# 3 Dog Bark A Group A Blue
# 4 Dog Bark B Group B Red
# 5 Mouse Squeak A Group A Blue
# 6 Mouse Squeak B Group B Red
# 7 Bird Chirp A Group A Blue
# 8 Bird Chirp B Group B Red
# 9 Lion Roar A Group A Blue
#10 Lion Roar B Group B Red
While writing this I had the idea to use nesting() on both sides of tidyr's complete() function and to detect the missing columns manually (maybe there is a more elegant solution):
# First find all grouping columns
groupCols = colnames(df)[!(colnames(df) %in% colnames(dfToAdd))]
otherCols = colnames(df)[colnames(df) %in% colnames(dfToAdd)]
# Populate missing columns with first grouping appearing in the df
dfToAdd[groupCols] = df[1, groupCols]
# rbind it to append
dfResult = rbind(df, dfToAdd)
# Now we have obvious missing combinations, tidyr::complete accepts nesting information to generate combinations only for those, which needs to be different.
dfResult %>% tidyr::complete(tidyr::nesting(!!! syms(otherCols)), tidyr::nesting(!!! syms(groupCols)))
edit: actually realized that I am using unknown column names at the end. This doesn't work really. I need to feed groupCols (character vector) to second nesting call.
edit2: now thanks to akrun's answer, I can correct this one too.

Transforming a dataframe in r to apply pivot table

I have a data frame like below:
Red Green Black
John A B C
Sean A D C
Tim B C C
How can I transform it into the form below so that I can apply a pivot table (or can this be done directly in R without transforming the data)?
Names Code Type
John Red A
John Green B
John Black C
Sean Red A
Sean Green D
Sean Black C
Tim Red B
Tim Green C
Tim Black C
So then my ultimate goal is to count the types as below by a pivot table on the transformed dataframe:
Count of Code for each type:
Row Labels A B C D Grand Total
John 1 1 1 3
Sean 1 1 1 3
Tim 1 2 3
Grand Total 2 2 4 1 9
Reading similar topics did not help that much.
Thanks in advance!
Regards
Using a literal dump from your first matrix-like frame above:
dat <- structure(list(Red = c("A", "A", "B"), Green = c("B", "D", "C"
), Black = c("C", "C", "C")), class = "data.frame", row.names = c("John",
"Sean", "Tim"))
I can do this:
library(dplyr)
library(tidyr)
tibble::rownames_to_column(dat, var = "Names") %>%
gather(Code, Type, -Names)
# Names Code Type
# 1 John Red A
# 2 Sean Red A
# 3 Tim Red B
# 4 John Green B
# 5 Sean Green D
# 6 Tim Green C
# 7 John Black C
# 8 Sean Black C
# 9 Tim Black C
We can extend that to get your next goal:
tibble::rownames_to_column(dat, var = "Names") %>%
gather(Code, Type, -Names) %>%
xtabs(~ Names + Type, data = .)
# Type
# Names A B C D
# John 1 1 1 0
# Sean 1 0 1 1
# Tim 0 1 2 0
which then just needs marginals:
tibble::rownames_to_column(dat, var = "Names") %>%
gather(Code, Type, -Names) %>%
xtabs(~ Names + Type, data = .) %>%
addmargins()
# Type
# Names A B C D Sum
# John 1 1 1 0 3
# Sean 1 0 1 1 3
# Tim 0 1 2 0 3
# Sum 2 2 4 1 9
You can use reshape(). I'm not sure about your data structure, if there is a column with names or if they are row names. I've added both versions.
reshape(dat1, idvar="Names",
varying=2:4,
v.names="Type", direction="long",
timevar="Code", times=c("red", "green", "black"),
new.row.names=1:9)
reshape(transform(dat2, Names=rownames(dat2)), idvar="Names",
varying=1:3,
v.names="Type", direction="long",
timevar="Code", times=c("red", "green", "black"),
new.row.names=1:9)
# V1 Code Type
# 1 John red A
# 2 Sean red A
# 3 Tim red B
# 4 John black B
# 5 Sean black D
# 6 Tim black C
# 7 John green C
# 8 Sean green C
# 9 Tim green C
To get kind of a raw version you could do:
res <- reshape(transform(dat2, Names=rownames(dat2)), idvar="Names",
varying=1:3,
v.names="Type", direction="long",
timevar="Code")
res
# Names Code Type
# John.1 John 1 A
# Sean.1 Sean 1 A
# Tim.1 Tim 1 B
# John.2 John 2 B
# Sean.2 Sean 2 D
# Tim.2 Tim 2 C
# John.3 John 3 C
# Sean.3 Sean 3 C
# Tim.3 Tim 3 C
After that you may assign labels at will to "Code" column by transforming to factor like so:
res$Code <- factor(res$Code, labels=c("red", "green", "black"))
Data
dat1 <- structure(list(Names = c("John", "Sean", "Tim"), Red = c("A",
"A", "B"), Green = c("B", "D", "C"), Black = c("C", "C", "C")), row.names = c(NA,
-3L), class = "data.frame")
dat2 <- structure(list(Red = c("A", "A", "B"), Green = c("B", "D", "C"
), Black = c("C", "C", "C")), row.names = c("John", "Sean", "Tim"
), class = "data.frame")
What you aim to do is (1) create a contingency table and then (2) compute the sum of the table entries for both rows and columns.
Step 1: Create a contingency table
I first pivoted the data using pivot_longer() rather than gather() because it's more intuitive. Then I applied table() to the two variables of interest.
# Toy example
df <- structure(list(Red = c("A", "A", "B"), Green = c("B", "D", "C"
), Black = c("C", "C", "C")), class = "data.frame", row.names = c("John",
"Sean", "Tim"))
# Pivot the data
long_df <- tibble::rownames_to_column(df, var = "Names") %>%
tidyverse::pivot_longer(cols = c(-Names),
names_to = "Type",
values_to = "Code")
# Create a contingency table
df_table <- table(long_df$Names, long_df$Code)
Step 2: Compute the sum of entries for both rows and columns.
Again, I only used a base R function margin.table(). Using this approach also allows you to save the sum of the row and column entries for further analysis.
# Row totals (margin = 1 indicates rows)
df_table %>%
margin.table(margin = 1)
# Column totals (margin = 2 indicates columns)
df_table %>%
margin.table(margin = 2)

Combine two data frames across multiple columns

Say I have two dataframes, each with four columns. One column is a numeric value. The other three are identifying variables. For example:
set1 <- data.frame(label1 = c("a","b", "c"), label2 = c("red", "white", "blue"), name = c("sam", "bob", "drew"), val = c(1, 10, 100))
set2 <- data.frame(label1 = c("b","c", "d"), label2 = c("white", "green", "orange"), name = c("bob", "drew", "collin"), val = c(7, 100, 15))
Which are:
> set1
label1 label2 name val
1 a red sam 1
2 b white bob 10
3 c blue drew 50
> set2
label1 label2 name val
1 b white bob 7
2 c green drew 100
3 d orange collin 15
The first three columns can be combined to form a primary key. What is the most efficient way to combine these two data frames such that all unique values (from columns label1, label2, name) are displayed along with the two val columns:
set3 <- data.frame(label = c("a", "b", "c", "c", "d"), label2 = c("red", "white", "blue", "green", "orange"), name = c("sam", "bob", "drew", "drew", "collin"), val.set1 = c(1, 10, 50, NA, NA), val.set2 = c(NA, 7, NA, 100, 15))
> set3
label label2 name val.set1 val.set2
1 a red sam 1 NA
2 b white bob 10 7
3 c blue drew 50 NA
4 c green drew NA 100
5 d orange collin NA 15
>
When thinking of efficiency, you should evaluate the data.table package:
library(data.table)
(merge(
setDT(set1, key=names(set1)[1:3]),
setDT(set2, key=names(set2)[1:3]),
all=T,
suffixes=paste0(".set",1:2)
) -> set3)
# label1 label2 name val.set1 val.set2
# 1: a red sam 1 NA
# 2: b white bob 10 7
# 3: c blue drew 100 NA
# 4: c green drew NA 100
# 5: d orange collin NA 15
Since they're in the same format, you could just rowbind them together and then take only the unique values. Using dplyr:
bind_rows(set1, set2) %>% distinct(label1, label2, name)
You just want to make sure that you don't have factors in there, that everything is a character or numeric.

Resources