Merge messy data frames in R

I have two data frames:
df1=data.frame(Col1=c('2','4','CN','CANADA',NA),Col2=c('s1','s2','s3','s4','s5'))
> df1
Col1 Col2
1 2 s1
2 4 s2
3 CN s3
4 CANADA s4
5 <NA> s5
df2=data.frame(index=1:5,code=c('AB','CA','US','CN','UK'),name=c('ALBERTA','CANADA','USA','CHINA','UK'),REGION=c('NA','NA','NA','FE','EU'))
> df2
index code name REGION
1 1 AB ALBERTA NA
2 2 CA CANADA NA
3 3 US USA NA
4 4 CN CHINA FE
5 5 UK UK EU
I want
df3=data.frame(df1,code=c('CA','CN','CN','CA',NA),name=c('CANADA','CHINA','CHINA','CANADA',NA),REGION=c('NA','FE','FE','NA',NA))
Col1 Col2 code name REGION
1 2 s1 CA CANADA NA
2 4 s2 CN CHINA FE
3 CN s3 CN CHINA FE
4 CANADA s4 CA CANADA NA
5 <NA> s5 <NA> <NA> <NA>
I have tried calling it by values:
df1$code=df2[df2$index[df1$Col1],2]
which fills it in incorrectly. I have also tried merging twice:
m1=merge(df1,df2,by.x='Col1',by.y='index',all.x=TRUE)
m2=merge(m1,df2,by.x='Col1',by.y='name',all.x=1)
I am sure I am missing something here. Thanks for your help

Maybe not a very nice solution but it works for this example:
ind <- sapply(df1$Col1, function(x)
  which(df2[, c("index", "code", "name")] == as.character(x), arr.ind = TRUE)[1])
cbind(df1, df2[ind, ])
Col1 Col2 index code name REGION
2 2 s1 2 CA CANADA NA
4 4 s2 4 CN CHINA FE
4.1 CN s3 4 CN CHINA FE
2.1 CANADA s4 2 CA CANADA NA
NA <NA> s5 NA <NA> <NA> <NA>
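If you only want the columns that appear in the desired df3, a small variation of the line above is to subset df2 inside the cbind; the leftover index column (and the odd row names) come from keeping every column of df2:
cbind(df1, df2[ind, c("code", "name", "REGION")])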

As far as I understand the problem, Col1 of df1 contains mixed information. So my approach would be to separate the different data types. Then it should be easy to merge correctly.
chr <- as.character(df1$Col1)
index_df1 <- chr
index_df1[!grepl("^[0-9]*$", chr)] <- NA
index_df1 <- as.numeric(index_df1)
code_df1 <- chr
code_df1[!grepl("^[A-Z]{2}$", chr)] <- NA
name_df1 <- chr
name_df1[!grepl("^[A-Z]{3,}$", chr)] <- NA
df1 <- data.frame(df1, index_df1, code_df1, name_df1)
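To finish the merge this answer describes, one sketch (assuming the helper columns index_df1, code_df1, and name_df1 created above) is to look up each identifier type in df2 separately and keep whichever lookup succeeded for each row:
# match each identifier type against the corresponding df2 column
i1 <- match(df1$index_df1, df2$index)
i2 <- match(df1$code_df1, df2$code)
i3 <- match(df1$name_df1, df2$name)
# per row, take whichever match is not NA
row_in_df2 <- ifelse(!is.na(i1), i1, ifelse(!is.na(i2), i2, i3))
cbind(df1[, c("Col1", "Col2")], df2[row_in_df2, c("code", "name", "REGION")])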

Related

Merging data frames of different lengths without unique keys

I am trying to merge two data frames of different lengths without using a unique key.
For example:
Name <- c("Steve","Peter")
Age <- c(10,20)
df1 <- data.frame(Name,Age)
> df1
Name Age
1 Steve 10
2 Peter 20
Name <-c("Jason","Nelson")
School <-c("xyz","abc")
df2 <- data.frame(Name,School)
> df2
Name School
1 Jason xyz
2 Nelson abc
I want to join these two tables so that I have all columns and have NA cells for rows that didn't have that column originally. It should look something like this:
Name Age School
1 Steve 10 <NA>
2 Peter 20 <NA>
3 Jason NA xyz
4 Nelson NA abc
thank you in advance!
dplyr::bind_rows(df1,df2)
# Warning in bind_rows_(x, .id) :
# Unequal factor levels: coercing to character
# Warning in bind_rows_(x, .id) :
# binding character and factor vector, coercing into character vector
# Warning in bind_rows_(x, .id) :
# binding character and factor vector, coercing into character vector
# Name Age School
# 1 Steve 10 <NA>
# 2 Peter 20 <NA>
# 3 Jason NA xyz
# 4 Nelson NA abc
You can alleviate some of this by pre-assigning unrecognized columns, which also works well with base R:
df2 <- cbind(df2, df1[NA,setdiff(names(df1), names(df2)),drop=FALSE])
df1 <- cbind(df1, df2[NA,setdiff(names(df2), names(df1)),drop=FALSE])
df1
# Name Age School
# NA Steve 10 <NA>
# NA.1 Peter 20 <NA>
df2
# Name School Age
# NA Jason xyz NA
# NA.1 Nelson abc NA
# ensure we use the same column order for both frames
nms <- names(df1)
rbind(df1[,nms], df2[,nms])
# Name Age School
# NA Steve 10 <NA>
# NA.1 Peter 20 <NA>
# NA1 Jason NA xyz
# NA.11 Nelson NA abc
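For the original, unmodified df1 and df2, a plain outer merge is another base R option here, since Name is the only column the two frames share (row order may differ from the bind_rows output):
merge(df1, df2, by = "Name", all = TRUE)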

Complex join of longitudinal tables in R

I have ~16 .txt files that I need to turn into one, wide flat file. For each new file, time has passed, and some new variables are added. What I would like to do is append those new columns to the right side of the first table, joining by an identification variable. This gets complicated quickly, so here is an MRE:
library(dplyr)
id <- as.character(1:6)
first <- c("jeff", "jimmy", "andrew", "taj", "karl-anthony", "jamal")
last <- c("teague", "butler", "wiggins", "gibson", "towns", "crawford")
set.seed(1839)
a <- c(1:4, NA, NA)
b <- c(1:4, NA, NA)
c <- c(11:13, NA, 14, NA)
d <- c(11:13, NA, 14, NA)
e <- c(21, 22, NA, 24, NA, 26)
f <- c(21, 22, NA, 24, NA, 26)
Simulating the three different files:
df_1 <- data.frame(
id = id[c(1:3,5)],
first = first[c(1:3,5)],
last = last[c(1:3,5)],
a = a[c(1:3,5)],
b = b[c(1:3,5)]
)
df_2 <- data.frame(
id = id[c(1:3,5)],
first = first[c(1:3,5)],
last = last[c(1:3,5)],
c = c[c(1:3,5)],
d = d[c(1:3,5)]
)
df_3 <- data.frame(
id = id[c(1,2,4,6)],
first = first[c(1,2,4,6)],
last = last[c(1,2,4,6)],
e = e[c(1,2,4,6)],
f = f[c(1,2,4,6)]
)
df_goal <- data.frame(id, first, last, a, b, c, d, e, f)
df_goal is what I want, and here is what it looks like:
> df_goal
id first last a b c d e f
1 1 jeff teague 1 1 11 11 21 21
2 2 jimmy butler 2 2 12 12 22 22
3 3 andrew wiggins 3 3 13 13 NA NA
4 4 taj gibson 4 4 NA NA 24 24
5 5 karl-anthony towns NA NA 14 14 NA NA
6 6 jamal crawford NA NA NA NA 26 26
Note that these are very big files, and the columns are not always in the right order, so I cannot just say to join by keeping the first three columns.
If I do a full_join on all, I get the names repeated every time:
df_all <- df_1 %>%
full_join(df_2, by = "id") %>%
full_join(df_3, by = "id")
> df_all
id first.x last.x a b first.y last.y c d first last e f
1 1 jeff teague 1 1 jeff teague 11 11 jeff teague 21 21
2 2 jimmy butler 2 2 jimmy butler 12 12 jimmy butler 22 22
3 3 andrew wiggins 3 3 andrew wiggins 13 13 <NA> <NA> NA NA
4 5 karl-anthony towns NA NA karl-anthony towns 14 14 <NA> <NA> NA NA
5 4 <NA> <NA> NA NA <NA> <NA> NA NA taj gibson 24 24
6 6 <NA> <NA> NA NA <NA> <NA> NA NA jamal crawford 26 26
What I tried next: I wrote a for loop that took each data frame, selected just (a) the id column and (b) the columns whose names had not yet appeared in the combined data frame, and then (c) did a full_join:
dfs <- c("df_2", "df_3")
df_all1 <- df_1
for (i in dfs) {
df_all1 <- get(i)[!names(get(i)) %in% names(df_all1)[-1]] %>%
full_join(df_all1, .)
}
> df_all1
id first last a b c d e f
1 1 jeff teague 1 1 11 11 21 21
2 2 jimmy butler 2 2 12 12 22 22
3 3 andrew wiggins 3 3 13 13 NA NA
4 5 karl-anthony towns NA NA 14 14 NA NA
5 4 <NA> <NA> NA NA NA NA 24 24
6 6 <NA> <NA> NA NA NA NA 26 26
Note that this means the cases that did not appear in the first file are missing the names (these represent key demographic variables in my data). I also tried going through row by row, doing a column join if the id was already present and a bind_rows if it was not. This code threw an error:
df_all2 <- df_1
for (i in dfs) {
  for (k in 1:nrow(get(i))) {
    if (get(i)[k, "id"] %in% df_all2$id) {
      df_all2 <- get(i)[k, !names(get(i)) %in% names(df_all2)[-1]] %>%
        left_join(df_all2, ., by = "id")
    } else {
      df_all2 <- bind_rows(
        df_all2,
        get(i)[k, !names(get(i)) %in% names(df_all2)[-1]]
      )
    }
  }
}
There has to be a way to do a join with only selected columns while still filling in missing information where necessary. Again, I am working with lots of files with lots of columns, so I cannot assume that I know the position of any column; it has to be done by the column names.
I have also thought about just including a new variable that is the date of the file, stacking them all on top of one another ("long" format), and then using tidyr::spread and tidyr::gather, but I haven't found a solution yet.
I am not wedded to the tidyverse (base or data.table would be great, even some way to do a SQL join in R) or even R; I am open to a Python solution using pandas, as well.
Short version: how do I join new columns onto an existing data set by identification number, and also fill in values for the already-existing columns when a case appears for the first time?
Possible solution, per Psidom:
df_all1 <- df_1
for (i in dfs) {
df_all1 <- get(i) %>%
full_join(
df_all1, .,
by = names(get(i))[names(get(i)) %in% names(df_all1)]
)
}
df_all1
Maybe there is a more efficient way to do this, though?
Using melt once you have the full_join result df_all:
library(data.table)
df <- melt(setDT(df_all),
measure.vars = patterns("^first", "^last"))
df <- unique(df[,-c("id", "variable")])
df[!is.na(df$value1),]
a b c d e f value1 value2
1: 1 1 11 11 21 21 jeff teague
2: 2 2 12 12 22 22 jimmy butler
3: 3 3 13 13 NA NA andrew wiggins
4: NA NA 14 14 NA NA karl-anthony towns
5: NA NA NA NA 24 24 taj gibson
6: NA NA NA NA 26 26 jamal crawford
The simplest solution using dplyr is to omit the by parameter in the calls to full_join().
library(dplyr)
df_1 %>%
full_join(df_2) %>%
full_join(df_3)
Joining, by = c("id", "first", "last")
Joining, by = c("id", "first", "last")
id first last a b c d e f
1 1 jeff teague 1 1 11 11 21 21
2 2 jimmy butler 2 2 12 12 22 22
3 3 andrew wiggins 3 3 13 13 NA NA
4 5 karl-anthony towns NA NA 14 14 NA NA
5 4 taj gibson NA NA NA NA 24 24
6 6 jamal crawford NA NA NA NA 26 26
Warning messages:
1: Column id joining factors with different levels, coercing to character vector
2: Column first joining factors with different levels, coercing to character vector
3: Column last joining factors with different levels, coercing to character vector
The documentation of the by parameter in ?full_join says: If NULL, the default, *_join() will do a natural join, using all variables with common names across the two tables.
So this is equivalent to explicitly passing by = c("id", "first", "last"), as proposed by Psidom.
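Spelled out, the explicit equivalent would be:
df_1 %>%
  full_join(df_2, by = c("id", "first", "last")) %>%
  full_join(df_3, by = c("id", "first", "last"))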
If there are many data frames to join, the code below may save a lot of typing:
Reduce(full_join, list(df_1, df_2, df_3))
The result (including messages) is the same as above.
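If the "Joining, by" messages bother you, one option (a sketch) is to compute the shared column names explicitly inside Reduce, which also makes the join keys visible in the code:
Reduce(function(x, y) full_join(x, y, by = intersect(names(x), names(y))),
       list(df_1, df_2, df_3))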

Delete consecutive empty rows in R

df presents possible name matches. Each pair of matches should be divided by an empty row. However, in some cases my output includes several empty rows between the matching pairs:
> df <- data.frame(id = c(1,2,NA,3,4,NA,NA,NA,5,6,NA), name = c("john jones", "john joners",
NA, "clara prat", "klara prat", NA, NA, NA, "alan turing", "allan turing",
NA), stringsAsFactors = F)
> df
id name
1 1 john jones
2 2 john joners
3 NA <NA>
4 3 clara prat
5 4 klara prat
6 NA <NA>
7 NA <NA>
8 NA <NA>
9 5 alan turing
10 6 allan turing
11 NA <NA>
The desired output is:
> df
id name
1 1 john jones
2 2 john joners
3 NA <NA>
4 3 clara prat
5 4 klara prat
6 NA <NA>
7 5 alan turing
8 6 allan turing
9 NA <NA>
I can do this with a for loop, which I understand is less than optimal.
Perhaps this helps. It splits the row indices into blocks, each block ending with a run of all-NA rows, and keeps rows only up to the first all-NA row in each block:
v1 <- rowSums(!is.na(df))
df[unlist(lapply(split(seq_along(v1),
cumsum(c(1, diff(!v1))<0)), function(i)
i[seq(which.max(v1[i]==0))])),]
# id name
#1 1 john jones
#2 2 john joners
#3 NA <NA>
#4 3 clara prat
#5 4 klara prat
#6 NA <NA>
#9 5 alan turing
#10 6 allan turing
#11 NA <NA>
Here is another approach, using rle to look for runs of missing values:
miss <- rowSums(is.na(df))
# get runs of missing
r <- rle(miss)
r$values <- seq_along(r$values)
# subset data, removing rows when all columns are missing
# and rows sequentially missing
df[!(miss == ncol(df) & duplicated(inverse.rle(r))), ]
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
As mentioned by Akrun, you can use data.table::rleid to avoid some of the explicit rle calculations
df[!(rowSums(is.na(df)) == ncol(df) & duplicated(data.table::rleid(is.na(df[[1]])))) , ]
Using the IRanges package.
df <- data.frame(id = c(1,2,NA,3,4,NA,NA,NA,5,6,NA), name = c("john jones", "john joners",
NA, "clara prat", "klara prat", NA, NA, NA, "alan turing", "allan turing",
NA), stringsAsFactors = F)
library(IRanges)
na.rs <- which(is.na(df$id) & is.na(df$name))
na.rs.re <- reduce(IRanges(na.rs, na.rs))
na.rs.rm <- na.rs.re[width(na.rs.re)>1]
start(na.rs.rm) <- start(na.rs.rm) + 1
df[-as.integer(na.rs.rm), ]
# id name
# 1 1 john jones
# 2 2 john joners
# 3 NA <NA>
# 4 3 clara prat
# 5 4 klara prat
# 6 NA <NA>
# 9 5 alan turing
# 10 6 allan turing
# 11 NA <NA>
Surely not the best solution, but easy to follow:
miss <- rowSums(is.na(df))
# number of all-NA runs; one blank separator row per run should remain
r <- sum(rle(miss)$values == ncol(df))
for (i in 2:nrow(df)) {
  while (i <= nrow(df) && is.na(df$id[i - 1]) && is.na(df$id[i])) {
    df <- df[-i, ]
    if (sum(is.na(df$id)) == r) break
  }
}

Converting data to a 2 column format?

If I have a dataset like the following:
LA NY MA
1 2 3
4 5 6
3 5
4
(In other words, each row has a different structure. LA has 3 values, NY has 4 values, etc.)
I am trying to use lm to perform an ANOVA test (to decide whether the mean number is the same in each state), and it keeps showing "an error occurred" because rows do not match. One idea I got was to convert data to a 2-column format. Which command/package should I use to perform that task?
Edit: the data is from the txt file.
Another option, after you read the file, to convert it to a 2-column format would be:
df <- read.table("Betty.txt", header=TRUE, fill=TRUE, sep="\t")
## (as #Richard Scriven mentioned in the comment)
na.omit(stack(df))
# values ind
#1 1 LA
#2 4 LA
#3 3 LA
#5 2 NY
#6 5 NY
#7 5 NY
#8 4 NY
#9 3 MA
#10 6 MA
Update
I got the above by transforming the data to use a tab (\t) delimiter. But if the file is copied/pasted directly from the OP's post without any change (making sure that there are spaces after the 2nd column in the 3rd and 4th rows):
lines <- readLines('Betty1.txt')
lines2 <- gsub("(?<=[^ ]) +|^[ ]+(?<=[ ])(?=[^ ])", ",", lines, perl=TRUE)
lines2
#[1] "LA,NY,MA" "1,2,3" "4,5,6" "3,5," ",4,"
df1 <- read.table(text=lines2, sep=',', header=TRUE)
df1
# LA NY MA
#1 1 2 3
#2 4 5 6
#3 3 5 NA
#4 NA 4 NA
and then do
na.omit(stack(df1))
Update2
Another option if you have fixed width columns is to use read.fwf
df <- read.fwf('Betty1.txt', widths=c(3,3,3), skip=1)
colnames(df) <- scan('Betty1.txt', nlines=1, what="", quiet=TRUE)
df
# LA NY MA
#1 1 2 3
#2 4 5 6
#3 3 5 NA
#4 NA 4 NA
library(tidyr)
gather(df, Var, Val, LA:MA, na.rm=TRUE)
# Var Val
#1 LA 1
#2 LA 4
#3 LA 3
#4 NY 2
#5 NY 5
#6 NY 5
#7 NY 4
#8 MA 3
#9 MA 6
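Whichever long format you end up with, the ANOVA from the original question then runs directly on the two columns; for example, with the gathered data above (a sketch):
long <- gather(df, Var, Val, LA:MA, na.rm = TRUE)
summary(aov(Val ~ Var, data = long))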
Just add an 'NA' to the 4th line of your text and try:
> ddf = read.table(text="
+ LA NY MA
+ 1 2 3
+ 4 5 6
+ 3 5
+ NA 4
+ ", header=T, fill=T)
>
> ddf
LA NY MA
1 1 2 3
2 4 5 6
3 3 5 NA
4 NA 4 NA
>
> dput(ddf)
structure(list(LA = c(1L, 4L, 3L, NA), NY = c(2L, 5L, 5L, 4L),
MA = c(3L, 6L, NA, NA)), .Names = c("LA", "NY", "MA"), class = "data.frame", row.names = c(NA,
-4L))
>
> library(reshape2)
> mm = melt(ddf)
No id variables; using all as measure variables
>
> mm
variable value
1 LA 1
2 LA 4
3 LA 3
4 LA NA
5 NY 2
6 NY 5
7 NY 5
8 NY 4
9 MA 3
10 MA 6
11 MA NA
12 MA NA
>
> with(mm, aov(value~variable))
Call:
aov(formula = value ~ variable)
Terms:
variable Residuals
Sum of Squares 4.833333 15.166667
Deg. of Freedom 2 6
Residual standard error: 1.589899
Estimated effects may be unbalanced
3 observations deleted due to missingness
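To get the usual ANOVA table with the F statistic and p-value, wrap the aov call in summary():
summary(with(mm, aov(value ~ variable)))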

Matching only if target 'cell' is NA

I have the following two data.frames opcat and polity.
opcat <- data.frame(country = rep(LETTERS[1:5]), date.ratification = c(2003,2004,2005,NA,NA), date.accession = c(NA,NA,NA,2000,2006))
opcat
polity <- data.frame(year = rep((2000:2007), 7), country = rep(LETTERS[1:7],8), polity.score = sample(10, 56, replace=TRUE))
polity <- polity[order(polity$country, polity$year),]
polity
I want to insert the polity.score from the data frame polity into the data frame opcat for the year in which a country 'ratified' (= date.ratification) or 'acceded' (= date.accession).
For ratification:
opcat$polity.score <- polity$polity.score[match(interaction(opcat$country, opcat$date.ratification), interaction(polity$country, polity$year))]
opcat
country date.ratification date.accession polity.score
1 A 2003 NA 10
2 B 2004 NA 2
3 C 2005 NA 10
4 D NA 2000 NA
5 E NA 2006 NA
For accession:
opcat$polity.score <- polity$polity.score[match(interaction(opcat$country, opcat$date.accession), interaction(polity$country, polity$year))]
opcat
country date.ratification date.accession polity.score
1 A 2003 NA NA
2 B 2004 NA NA
3 C 2005 NA NA
4 D NA 2000 9
5 E NA 2006 7
A country has either a date for ratification or for accession (not both). Since the matching for accession would fill the results for ratification with NA, I tried the following modification:
opcat$polity.score[is.na(opcat$date.ratification)] <- polity$polity.score[match(interaction(opcat$country, opcat$date.accession), interaction(polity$country, polity$year))]
opcat
But this doesn't work. I get the error message "number of items to replace is not a multiple of replacement length". How can I match the scores into the same variable without overwriting the ones already filled in?
The final result should be
country date.ratification date.accession polity.score
1 A 2003 NA 10
2 B 2004 NA 2
3 C 2005 NA 10
4 D NA 2000 9
5 E NA 2006 7
I would think that this shouldn't be that difficult.
Many thanks.
Add the year to opcat, and perform a standard merge:
opcat$year <- with(opcat, ifelse(is.na(date.ratification), date.accession, date.ratification))
merge(opcat,polity)
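Alternatively, a minimal fix of the original assignment (a sketch): the error occurred because the left-hand side selects only the rows where date.ratification is NA, while the right-hand side has one value per row of opcat, so subset the right-hand side the same way (run this after the ratification match above):
idx <- is.na(opcat$date.ratification)
opcat$polity.score[idx] <- polity$polity.score[
  match(interaction(opcat$country, opcat$date.accession),
        interaction(polity$country, polity$year))
][idx]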
