Creating a loop to add labels to columns: library(Hmisc) - r

I have a dataset which looks something like this:
Year Country Matchcode P H
1 2000 France 0001 1213 1872
2 2001 France 0002 1234 2345
3 2000 UK 0003 1726 2234
4 2001 UK 0004 6433 9082
I have another dataset which looks something like this:
Indicator Code Indicator Name
P Power
H Happiness
I would like to add info in the second column of the second dataset (Power, Happiness) as a label to the abbreviation used in the first dataset with a loop, but I don't know exactly how to write the loop.
This is how far I got:
library(Hmisc)
for i in df2[,1]{
if (df1[,i] == df2[i,]){
label(df1[,i]) <- df2[i,2]
}}
But this merely checks whether names are the same and does not search for it.
Could anyone guide further?
Desired output:
Year Country Matchcode P(label=Power) H(label=Happiness)
1 2000 France 0001 1213 1872
2 2001 France 0002 1234 2345
3 2000 UK 0003 1726 2234
4 2001 UK 0004 6433 9082

If you specifically want to use a loop, this approach gives the output you describe:
df <- data.frame(Year = c(2000, 2001, 2000, 2001),
Country = c("France", "France", "UK","UK"),
Matchcode = c("0001", "0002", "0003", "0004"),
P = c(1213, 1234, 1726, 6433),
H = c(1872, 2345, 2234, 9082))
lookup <- data.frame(code = c ("P", "H"),
label = c("Power", "Happiness"),
stringsAsFactors = FALSE)
# Look up every column name in the lookup table once, outside the loop:
# hit[i] is the matching row of `lookup`, or NA when column i has no entry.
hit <- match(colnames(df), lookup$code)
# seq_along() is safe for a zero-column data frame (1:0 would still iterate).
for (i in seq_along(hit)) {
  if (!is.na(hit[i])) {
    # Attach the descriptive text as an Hmisc label on the column itself.
    Hmisc::label(df[[i]]) <- lookup$label[hit[i]]
  }
}
This works:
Hmisc::label(df[4])
# P
# "Power"
It also checks out in the RStudio viewer:
Like several of the other answerers and commenters, I had originally thought you wanted to append the "label = " text to the column names. For anyone wanting that, this is the (loop) code.
# Resolve all matches up front, before any column is renamed:
# hit[i] is the matching row of `lookup`, or NA when column i has no entry.
hit <- match(colnames(df), lookup$code)
# seq_along() is safe for a zero-column data frame (1:0 would still iterate).
for (i in seq_along(hit)) {
  if (!is.na(hit[i])) {
    # Append "(label=<text>)" to the name of every column found in `lookup`.
    colnames(df)[i] <- paste0(colnames(df)[i],
                              "(label=",
                              lookup$label[hit[i]],
                              ")")
  }
}

It's not clear to me at all what you're trying to do with Hmisc::label but I think you're misinterpreting the role & function of Hmisc::label.
Consider the following:
Let's construct a sample data.frame consisting of 2 rows and 2 columns.
df <- setNames(data.frame(matrix(0, ncol = 2, nrow = 2)), c("a", "b"))
df
# a b
#1 0 0
#2 0 0
We extract the column names. Note that cn is a character vector.
cn <- colnames(df)
cn
#[1] "a" "b"
We now set a Hmisc::label for cn.
label(cn) <- "label for cn"
cn
#label for cn
#[1] "a" "b"
We inspect the attributes of cn
attributes(cn)
#$label
#[1] "label for cn"
#
#$class
#[1] "labelled" "character"
We now assign cn to the column names of df.
colnames(df) <- cn
df
# a b
#1 0 0
#2 0 0
Note how the label attribute is not stored as part of the column names.

Here's a dplyr solution:
# example datasets
df = read.table(text = "
Year Country Matchcode P H
1 2000 France 0001 1213 1872
2 2001 France 0002 1234 2345
3 2000 UK 0003 1726 2234
4 2001 UK 0004 6433 9082
", header=T)
df2 = read.table(text = "
IndicatorName IndicatorCode
P Power
H Happiness
", header=T)
library(dplyr)
data.frame(original_names = names(df)) %>% # get original names
left_join(df2, by=c("original_names"="IndicatorName")) %>% # join names that should be updated
mutate(new_names = ifelse(is.na(IndicatorCode), original_names, paste0(original_names,"(label=",IndicatorCode,")"))) %>% # if there is a match update the name
pull(new_names) -> list_new_names # get column of new names and store it in a vector
# update names
names(df) = list_new_names
# check new names
df
# Year Country Matchcode P(label=Power) H(label=Happiness)
# 1 2000 France 1 1213 1872
# 2 2001 France 2 1234 2345
# 3 2000 UK 3 1726 2234
# 4 2001 UK 4 6433 9082

This would work. Find the corresponding text using match(), which returns the row of df2 belonging to each column name (so the result does not depend on the row order of df2), and use paste0 to generate the label.
colnames(df1)[4:5] <- paste0(colnames(df1)[4:5], '(label=', df2$V2[match(colnames(df1)[4:5], df2$V1)], ')')
df1
Year Country Matchcode P(label=Power) H(label=Happiness)
1 2000 France 1 1213 1872
2 2001 France 2 1234 2345
3 2000 UK 3 1726 2234
4 2001 UK 4 6433 9082
Data used
df1 <- read.table(text="Year Country Matchcode P H
1 2000 France 0001 1213 1872
2 2001 France 0002 1234 2345
3 2000 UK 0003 1726 2234
4 2001 UK 0004 6433 9082", header=T, stringsAsFactors=F)
df2 <- read.table(text="
P Power
H Happiness", header=F, stringsAsFactors=F)

If you still want to stick with Hmisc, you can modify the 'print' function to handle the extra information provided by the labels, or rather (and less harmfully) tell R that your data has to be printed using the labels. You can achieve this by creating a new data frame class for which the print function behaves differently.
The 'print' trick is not necessary in RStudio, which natively displays the labels together with the column names.
df1 = read.table(text = "
Year Country Matchcode P H
1 2000 France 0001 1213 1872
2 2001 France 0002 1234 2345
3 2000 UK 0003 1726 2234
4 2001 UK 0004 6433 9082 ", header=T)
df2 = read.table(text = "
var lab
P Power
H Happiness", header=T, stringsAsFactors=FALSE)
## Set the labels of the columns in df1 accordingly to df2
library(Hmisc)
# seq_len() is safe for a zero-column data frame (1:0 would still iterate).
for (i in seq_len(ncol(df1))) {
  # Label text for column i, or a zero-length vector when df2 has no entry.
  lab <- df2$lab[df2$var == colnames(df1)[i]]
  # Only label the columns that actually appear in df2.
  if (length(lab) != 0) {
    label(df1[[i]]) <- lab
  }
}
# A 'print' method dedicated to 'truc' objects.
# Mainly it is the code from the original print method for data frames,
# except for dimnames[[2L]]: each column header shows the column's "label"
# attribute when one is set, and falls back to the column name otherwise.
#
# Arguments:
#   x         a data frame that also carries the class "truc".
#   ...       further arguments forwarded to print() for the formatted matrix.
#   digits    forwarded to format.data.frame().
#   quote     forwarded to print(): should strings be quoted?
#   right     forwarded to print(): should strings be right-aligned?
#   row.names TRUE to print the row names, FALSE to print none, or a vector
#             of row names to print instead.
# Returns `x`, invisibly.
print.truc <- function(x, ..., digits = NULL, quote = FALSE, right = TRUE,
                       row.names = TRUE) {
  n <- length(row.names(x))
  if (length(x) == 0L) {
    cat(sprintf(ngettext(n, "data frame with 0 columns and %d row",
                         "data frame with 0 columns and %d rows"), n), "\n",
        sep = "")
  } else if (n == 0L) {
    print.default(names(x), quote = FALSE)
    cat(gettext("<0 rows> (or 0-length row.names)\n"))
  } else {
    m <- as.matrix(format.data.frame(x, digits = digits,
                                     na.encode = FALSE))
    if (!isTRUE(row.names)) {
      dimnames(m)[[1L]] <- if (isFALSE(row.names)) {
        rep.int("", n)
      } else {
        row.names
      }
    }
    # Build the headers as a plain character vector. vapply() guarantees one
    # string per column; exact = TRUE stops a differently named attribute
    # such as "labels" from being picked up through partial matching.
    dimnames(m)[[2L]] <- vapply(
      seq_along(x),
      function(i) {
        lab <- attr(x[[i]], "label", exact = TRUE)
        if (length(lab) != 0L) as.character(lab)[1L] else names(x)[i]
      },
      character(1)
    )
    print(m, ..., quote = quote, right = right)
  }
  invisible(x)
}
# Says that 'df1' is an 'enhanced' data frame
class(df1) <- c("truc",class(df1))
# Print as enhanced
print(df1)
# Year Country Matchcode Power Happiness
#1 2000 France 1 1213 1872
#2 2001 France 2 1234 2345
#3 2000 UK 3 1726 2234
#4 2001 UK 4 6433 9082
# Print using standard way
print(as.data.frame(df1))
# Year Country Matchcode P H
#1 2000 France 1 1213 1872
#2 2001 France 2 1234 2345
#3 2000 UK 3 1726 2234
#4 2001 UK 4 6433 9082

No need for a loop with Hmisc, can do this in one line using the option self = FALSE in the label command.
label(df1[, df2$IndicatorName], self = FALSE) <- df2$IndicatorCode
Ie.
library(Hmisc, warn.conflicts = FALSE, quietly = TRUE)
df1 = read.table(text = "
Year Country Matchcode P H
1 2000 France 0001 1213 1872
2 2001 France 0002 1234 2345
3 2000 UK 0003 1726 2234
4 2001 UK 0004 6433 9082
", header=T)
df2 = read.table(text = "
IndicatorName IndicatorCode
P Power
H Happiness
", header=T)
label(df1[, df2$IndicatorName], self = FALSE) <- df2$IndicatorCode
sapply(df1, label)
#> Year Country Matchcode P H
#> "" "" "" "Power" "Happiness"
Created on 2020-09-14 by the reprex package (v0.3.0)

Related

Casting and Melting with reshape in R

As an example let's say that I have the following dataframe:
datas=data.frame(Variables=c("Power","Happiness","Power","Happiness"),
Country=c("France", "France", "UK", "UK"), y2000=c(1213,1872,1726,2234), y2001=c(1234,2345,6433,9082))
Resulting in the following output:
Variables Country 2000 2001
1 Power France 1213 1234
2 Happiness France 1872 2345
3 Power UK 1726 6433
4 Happiness UK 2234 9082
I would like to reshape this dataframe as follows:
Year Country Power Happiness
1 2000 France 1213 1872
2 2001 France 1234 2345
3 2000 UK 1726 2234
4 2001 UK 6433 9082
I started out with:
q2=cast(datas, Country~Variables, value="2000")
But then got the following error:
Aggregation requires fun.aggregate: length used as default
Error in `[.data.frame`(sort_df(data, variables), , c(variables, "value"), :
undefined columns selected
Any suggestions?
Also: Would it matter for the solution that my dataframe is really big (417120 by 62)?
Perhaps you're interested in a tidyverse alternative
library(tidyverse)
df %>%
gather(Year, val, -Variables, -Country) %>%
spread(Variables, val)
# Country Year Happiness Power
#1 France 2000 1872 1213
#2 France 2001 2345 1234
#3 UK 2000 2234 1726
#4 UK 2001 9082 6433
Or using reshape2::melt and reshape2::dcast
reshape2::dcast(
reshape2::melt(df, id.vars = c("Country", "Variables"), variable.name = "Year"),
Country + Year ~ Variables)
# Country Year Happiness Power
#1 France 2000 1872 1213
#2 France 2001 2345 1234
#3 UK 2000 2234 1726
#4 UK 2001 9082 6433
Or (identically) using data.table::melt and data.table::dcast
data.table::dcast(
data.table::melt(df, id.vars = c("Country", "Variables"), variable.name = "Year"),
Country + Year ~ Variables)
# Country Year Happiness Power
#1 France 2000 1872 1213
#2 France 2001 2345 1234
#3 UK 2000 2234 1726
#4 UK 2001 9082 6433
In terms of performance/runtime, I imagine the data.table or tidyr solutions to be the most efficient. You can check by running a microbenchmark on some larger sample data.
Sample data
df <-read.table(text =
" Variables Country 2000 2001
1 Power France 1213 1234
2 Happiness France 1872 2345
3 Power UK 1726 6433
4 Happiness UK 2234 9082", header = T)
colnames(df)[3:4] <- c("2000", "2001")
Benchmark analysis
Following results from a microbenchmark analysis of the four methods, based on a (slightly) larger 78x22 sample dataset.
set.seed(2017)
df <- data.frame(
Variables = rep(c("Power", "Happiness", "something_else"), 26),
Country = rep(LETTERS[1:26], each = 3),
matrix(sample(10000, 20 * 26 * 3), nrow = 26 * 3))
colnames(df)[3:ncol(df)] <- 2000:2019
library(microbenchmark)
library(tidyr)
res <- microbenchmark(
reshape2 = {
reshape2::dcast(
reshape2::melt(df, id.vars = c("Country", "Variables"), variable.name = "Year"),
Country + Year ~ Variables)
},
tidyr = {
df %>%
gather(Year, val, -Variables, -Country) %>%
spread(Variables, val)
},
datatable = {
data.table::dcast(
data.table::melt(df, id.vars = c("Country", "Variables"), variable.name = "Year"),
Country + Year ~ Variables)
},
reshape = {
reshape::cast(reshape::melt(df), Country + variable ~ Variables)
}
)
res
#Unit: milliseconds
# expr min lq mean median uq max neval
# reshape2 3.088740 3.449686 4.313044 3.919372 5.112560 7.856902 100
# tidyr 4.482361 4.982017 6.215872 5.771133 6.931964 28.293377 100
# datatable 3.179035 3.511542 4.861192 4.040188 5.123103 46.010810 100
# reshape 27.371094 30.226222 32.425667 32.504644 34.118499 41.286803 100
library(ggplot2)
autoplot(res)
As above, I would strongly recommend using tidyr instead of reshape, or at least using reshape2 instead of reshape, as it fixes many of the performance issues with reshape.
In reshape itself, you have to melt datas first
> cast(melt(datas), Country + variable ~ Variables)
Using Variables, Country as id variables
Country variable Happiness Power
1 France y2000 1872 1213
2 France y2001 2345 1234
3 UK y2000 2234 1726
4 UK y2001 9082 6433
And then renaming and converting the columns as necessary.
In reshape2 the code is identical, but you would use dcast instead of cast. tidyr, as in @Maurits Evers's solution above, is a better choice, and most development has shifted from reshape2 to the tidyverse.

Dataframe does not correctly reshape

I have the following dataframe:
Variables Varcode Country Ccode 2000 2001
1 Power P France FR 1213 1234
2 Happiness H France FR 1872 2345
3 Power P UK UK 1726 6433
4 Happiness H UK UK 2234 9082
I would like to reshape this dataframe as follows:
Year Country Ccode P(label=Power) H(label=Happiness)
1 2000 France FR 1213 1872
2 2001 France FR 1234 2345
3 2000 UK UK 1726 2234
4 2001 UK UK 6433 9082
The original code was as follows:
library(tidyverse)
df %>%
gather(Year, val, -Variables, -Country) %>%
spread(Variables, val)
I tried to expand the code because, the Ccode and Indicator Code ended up as a row in the list and I decided I wanted to use the codes as variable names and the variable names as labels (please note that because of that I swapped -Variables and Variables with -Varcode and Varcode respectively):
library(tidyverse)
library(Hmisc)
List <- df$Variables
df<-df %>%
gather(Year, val, -Varcode, -Country) %>%
spread(Varcode, val)
for(i in List){
label(df[,i]) <- List[i]
}
Please note: I am using a list because of memory limitations.
I ran into two problems:
The transformation does not go smoothly because two additional columns from df (among which Variables) are added where values should be.
The label function gives an error.
Can anyone help me figure out what goes wrong?
I think you went wrong with your selection of columns to gather
Data:
# Sample data, including the Ccode column that the pipeline below unites and
# separates. read.table() turns the headers 2000 and 2001 into the syntactic
# names X2000 and X2001, so they are renamed back afterwards.
df <- read.table(text = "Variables Varcode Country Ccode 2000 2001
1 Power P France FR 1213 1234
2 Happiness H France FR 1872 2345
3 Power P UK UK 1726 6433
4 Happiness H UK UK 2234 9082", header = TRUE, stringsAsFactors = FALSE) %>%
  rename(`2000` = X2000, `2001` = X2001)
# Gather only the year columns; Country and Ccode are fused into one key
# while spreading and split apart again afterwards.
df %>%
  select(-Varcode) %>%
  gather(Year, val, `2000`:`2001`) %>%
  unite(Country_Ccode, Country, Ccode, sep = "_") %>%
  spread(Variables, val) %>%
  separate(Country_Ccode, c("Country", "Ccode"), sep = "_")
Output
Country Ccode Year Happiness Power
1 France FR 2000 1872 1213
2 France FR 2001 2345 1234
3 UK UK 2000 2234 1726
4 UK UK 2001 9082 6433

From monadic to dyadic data in R

For the sake of simplicity, let's say I have a dataset at the country-year level that lists organizations that received aid from a government, how much money that was, and the type of project. The data frame has "space" for 10 organizations each year, but not every government subsidizes so many organizations each year, so there are a lot of blank spaces. Moreover, they do not follow any order: one organization can be in the first spot one year, and the next year be coded in the second spot. The data looks like this:
> State Year Org1 Aid1 Proj1 Org2 Aid2 Proj2 Org3 Aid3 Proj3 Org4 Aid4 Proj4 ...
Italy 2000 A 1000 Arts B 500 Arts C 300 Social
Italy 2001 B 700 Social A 1000 Envir
Italy 2002 A 1000 Arts C 300 Envir
UK 2000
UK 2001 Z 2000 Social
UK 2002 Z 2000 Social
...
I'm trying to transform this into dyadic data, which would look like this:
> State Org Year Aid Proj
Italy A 2000 1000 Arts
Italy A 2001 1000 Envir
Italy A 2002 1000 Arts
Italy B 2000 500 Arts
Italy B 2001 700 Social
Italy C 2000 300 Social
Italy C 2002 300 Envir
UK Z 2001 2000 Social
...
I'm using R, and the best way I could find was building a pre-defined possible set of dyads —using something like expand.grid(unique(State), unique(Org))— and then looping through the data, finding the corresponding column and filling the data frame. But I don't think this is the most effective method, so I was wondering whether there would be a better way. I thought about dplyr or reshape but can't find a solution.
I know this is a recurring question, but couldn't really find an answer. The most similar question is this one, but it's not exactly the same.
Thanks a lot in advance.
Since you did not use dput, I will try and make some data that resemble yours:
dat = data.frame(State = rep(c("Italy", "UK"), 3),
Year = rep(c(2014, 2015, 2016), 2),
Org1 = letters[1:6],
Aid1 = sample(800:1000, 6),
Proj1 = rep(c("A", "B"), 3),
Org2 = letters[7:12],
Aid2 = sample(600:700, 6),
Proj2 = rep(c("C", "D"), 3),
stringsAsFactors = FALSE)
dat
# State Year Org1 Aid1 Proj1 Org2 Aid2 Proj2
# 1 Italy 2014 a 910 A g 658 C
# 2 UK 2015 b 926 B h 681 D
# 3 Italy 2016 c 834 A i 625 C
# 4 UK 2014 d 858 B j 620 D
# 5 Italy 2015 e 831 A k 650 C
# 6 UK 2016 f 821 B l 687 D
Next I gather the data and then use extract to make 2 new columns and then spread it all again:
library(tidyr)
library(dplyr)
dat %>%
gather(key, value, -c(State, Year)) %>%
extract(key, into = c("key", "num"), "([A-Za-z]+)([0-9]+)") %>%
spread(key, value) %>%
select(-num)
# State Year Aid Org Proj
# 1 Italy 2014 910 a A
# 2 Italy 2014 658 g C
# 3 Italy 2015 831 e A
# 4 Italy 2015 650 k C
# 5 Italy 2016 834 c A
# 6 Italy 2016 625 i C
# 7 UK 2014 858 d B
# 8 UK 2014 620 j D
# 9 UK 2015 926 b B
# 10 UK 2015 681 h D
# 11 UK 2016 821 f B
# 12 UK 2016 687 l D
Is this the desired output?

Add new column if range of columns contains string in R

I have a dataframe like below. I would like to add 2 columns:
ContainsANZ: Indicates if any of the columns from F0 to F3 contain 'Australia' or 'New Zealand' ignoring NA values
AllANZ: Indicates if all non NA columns contain 'Australia' or 'New Zealand'
Starting dataframe would be:
dfContainsANZ
Col.A Col.B Col.C F0 F1 F2 F3
1 data 0 xxx Australia Singapore <NA> <NA>
2 data 1 yyy United States United States United States <NA>
3 data 0 zzz Australia Australia Australia Australia
4 data 0 ooo Hong Kong London Australia <NA>
5 data 1 xxx New Zealand <NA> <NA> <NA>
The end result should look like this:
df
Col.A Col.B Col.C F0 F1 F2 F3 ContainsANZ AllANZ
1 data 0 xxx Australia Singapore <NA> <NA> Australia undefined
2 data 1 yyy United States United States United States <NA> undefined undefined
3 data 0 zzz Australia Australia Australia Australia Australia Australia
4 data 0 ooo Hong Kong London Australia <NA> Australia undefined
5 data 1 xxx New Zealand <NA> <NA> <NA> New Zealand New Zealand
I'm using dplyr (preferred solution) and have come up with code like this, which doesn't work and is very repetitive. Is there a better way to write this so that I am not having to copy the F0|F1|F2... rules over again? My real data set has more of them. Are NAs interfering with the code?
df <- df %>%
mutate(ANZFlag =
ifelse(
F0 == 'Australia' |
F1 == 'Australia' |
F2 == 'Australia' |
F3 == 'Australia',
'Australia',
ifelse(
F0 == 'New Zealand' |
F1 == 'New Zealand' |
F2 == 'New Zealand' |
F3 == 'New Zealand',
'New Zealand', 'undefined'
)
)
)
Still some typing, but I think this gets at the essence you're looking for:
library(dplyr)
df <- read.table(text='Col.A,Col.B,Col.C,F0,F1,F2,F3
data,0,xxx,Australia,Singapore,NA,NA
data,1,yyy,"United States","United States","United States",NA
data,0,zzz,Australia,Australia,Australia,Australia
data,0,ooo,"Hong Kong",London,Australia,NA
data,1,xxx,"New Zealand",NA,NA,NA', header=TRUE, sep=",", stringsAsFactors=FALSE)
# Flag, for a single row, whether the F0..F3 columns mention Australia or
# New Zealand.
#   x: a one-row data frame (as supplied by rowwise() %>% do()).
# Returns `x` with two logical columns appended:
#   ContainsANZ - TRUE if any F column is "Australia" or "New Zealand";
#   AllANZ      - TRUE if every non-NA F column is "Australia" or
#                 "New Zealand".
down_under <- function(x) {
  mtch <- c("Australia", "New Zealand")
  cols <- unlist(x)[c("F0", "F1", "F2", "F3")]
  # Ignore missing entries, then compare against the target countries
  # (comparing the values against themselves would always be TRUE).
  present <- as.vector(na.omit(cols))
  bind_cols(x, data.frame(ContainsANZ = any(present %in% mtch),
                          AllANZ = all(present %in% mtch)))
}
rowwise(df) %>% do(down_under(.))
## Source: local data frame [5 x 9]
## Groups: <by row>
##
## Col.A Col.B Col.C F0 F1 F2 F3 ContainsANZ AllANZ
## (chr) (int) (chr) (chr) (chr) (chr) (chr) (lgl) (lgl)
## 1 data 0 xxx Australia Singapore NA NA TRUE FALSE
## 2 data 1 yyy United States United States United States NA FALSE FALSE
## 3 data 0 zzz Australia Australia Australia Australia TRUE TRUE
## 4 data 0 ooo Hong Kong London Australia NA TRUE FALSE
## 5 data 1 xxx New Zealand NA NA NA TRUE TRUE

R Cleaning and reordering names/serial numbers in data frame

Let's say I have a data frame as follows in R:
Data <- data.frame("SerialNum" = character(), "Year" = integer(), "Name" = character(), stringsAsFactors = F)
Data[1,] <- c("983\n837\n424\n ", 2015, "Michael\nLewis\nPaul\n ")
Data[2,] <- c("123\n456\n789\n136", 2014, "Elaine\nJerry\nGeorge\nKramer")
Data[3,] <- c("987\n654\n321\n975\n ", 2010, "John\nPaul\nGeorge\nRingo\nNA")
Data[4,] <- c("424\n983\n837", 2015, "Paul\nMichael\nLewis")
Data[5,] <- c("456\n789\n123\n136", 2014, "Jerry\nGeorge\nElaine\nKramer")
What I want to do is the following:
Split up each string of names and each string of serial numbers so that they are their own vectors (or a list of string vectors).
Eliminate any character "NA" in either set of vectors or any blank spaces denoted by "...\n ".
Reorder each list of names alphabetically and reorder the corresponding serial numbers according to the same permutation.
Concatenate each vector in the same fashion it was originally (I usually do this with paste(., collapse = "\n")).
My issue is how to do this without using a for loop. What is an object-oriented way to do this? As a first attempt in this direction I originally made a list by the command LIST <- strsplit(Data$Name, split = "\n") and from here I need a for loop in order to find the permutations of the names, which seems like a process that won't scale according to my actual data. Additionally, once I make the list LIST I'm not sure how I go about removing NA symbols or blank spaces. Any help is appreciated!
Using lapply I take each row of the data frame and turn it into a new data frame with one name per row. This creates a list of 5 data frames, one for each row of the original data frame.
# Build one data frame per row of `Data`, with one name per row, cleaned and
# sorted. seq_len() is safe when `Data` has no rows (1:0 would still iterate).
seinfeld <- lapply(seq_len(nrow(Data)), function(i) {
  # Turn strings into data frame with one name per row
  dat <- data.frame(
    SerialNum = unlist(strsplit(Data[i, "SerialNum"], split = "\n")),
    Year = Data[i, "Year"],
    Name = unlist(strsplit(Data[i, "Name"], split = "\n"))
  )
  # Get rid of empty strings and NA values
  dat <- dat[!(dat$Name %in% c("", " ", "NA")), ]
  # Order alphabetically; the sorted data frame is the return value
  dat[order(dat$Name), ]
})
UPDATE: Based on your comment, let me know if this is the result you're trying to achieve:
# For each row of `Data`: split the names and serial numbers, clean and sort
# them by name, then collapse them back into a single row.
# seq_len() is safe when `Data` has no rows (1:0 would still iterate).
seinfeld <- lapply(seq_len(nrow(Data)), function(i) {
  # Turn strings into data frame with one name per row
  dat <- data.frame(
    SerialNum = unlist(strsplit(Data[i, "SerialNum"], split = "\n")),
    Name = unlist(strsplit(Data[i, "Name"], split = "\n"))
  )
  # Get rid of empty strings and NA values
  dat <- dat[!(dat$Name %in% c("", " ", "NA")), ]
  # Order alphabetically
  dat <- dat[order(dat$Name), ]
  # Collapse back into a single row with the new sort order; this one-row
  # data frame is the return value
  data.frame(SerialNum = paste(dat[, "SerialNum"], collapse = "\n"),
             Year = Data[i, "Year"],
             Name = paste(dat[, "Name"], collapse = "\n"))
})
do.call(rbind, seinfeld)
SerialNum Year Name
1 837\n983\n424 2015 Lewis\nMichael\nPaul
2 123\n789\n456\n136 2014 Elaine\nGeorge\nJerry\nKramer
3 321\n987\n654\n975 2010 George\nJohn\nPaul\nRingo
4 837\n983\n424 2015 Lewis\nMichael\nPaul
5 123\n789\n456\n136 2014 Elaine\nGeorge\nJerry\nKramer
eipi10 offered a great answer. In addition to that, I'd like to leave what I tried, mainly with data.table. First, I split two columns (i.e., SerialNum and Name) with cSplit(), added an index with add_rownames(), and split the data by the index. In the first lapply(), I used Stacked() from the splitstackshape package. I stacked SerialNum and Name; the separated SerialNum and Name become two columns, as you can see in a part of temp2. In the second lapply(), I used merge from the data.table package. Then, I removed rows with NAs (lapply(na.omit)), combined all data tables (rbindlist), and changed the order of rows by rowname (which is the row number in the original data) and Name (setorder(rowname, Name)).
library(data.table)
library(splitstackshape)
library(dplyr)
cSplit(mydf, c("SerialNum", "Name"), direction = "wide",
type.convert = FALSE, sep = "\n") %>%
add_rownames %>%
split(f = .$rowname) -> temp
#a part of temp
#$`1`
#Source: local data frame [1 x 12]
#
#rowname Year SerialNum_1 SerialNum_2 SerialNum_3 SerialNum_4 SerialNum_5 Name_1 Name_2
#(chr) (dbl) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
#1 1 2015 983 837 424 NA NA Michael Lewis
#Variables not shown: Name_3 (chr), Name_4 (chr), Name_5 (chr)
lapply(temp, function(x){
Stacked(x, var.stubs = c("SerialNum", "Name"), sep = "_")
}) -> temp2
# A part of temp2
#$`1`
#$`1`$SerialNum
# rowname Year .time_1 SerialNum
#1: 1 2015 1 983
#2: 1 2015 2 837
#3: 1 2015 3 424
#4: 1 2015 4 NA
#5: 1 2015 5 NA
#
#$`1`$Name
# rowname Year .time_1 Name
#1: 1 2015 1 Michael
#2: 1 2015 2 Lewis
#3: 1 2015 3 Paul
#4: 1 2015 4 NA
#5: 1 2015 5 NA
lapply(1:nrow(mydf), function(x){
merge(temp2[[x]]$SerialNum, temp2[[x]]$Name, by = c("rowname", "Year", ".time_1"))
}) %>%
lapply(na.omit) %>%
rbindlist %>%
setorder(rowname, Name) -> out
print(out)
# rowname Year .time_1 SerialNum Name
# 1: 1 2015 2 837 Lewis
# 2: 1 2015 1 983 Michael
# 3: 1 2015 3 424 Paul
# 4: 2 2014 1 123 Elaine
# 5: 2 2014 3 789 George
# 6: 2 2014 2 456 Jerry
# 7: 2 2014 4 136 Kramer
# 8: 3 2010 3 321 George
# 9: 3 2010 1 987 John
#10: 3 2010 2 654 Paul
#11: 3 2010 4 975 Ringo
#12: 4 2015 3 837 Lewis
#13: 4 2015 2 983 Michael
#14: 4 2015 1 424 Paul
#15: 5 2014 3 123 Elaine
#16: 5 2014 2 789 George
#17: 5 2014 1 456 Jerry
#18: 5 2014 4 136 Kramer
DATA
mydf <- structure(list(SerialNum = c("983\n837\n424\n ", "123\n456\n789\n136",
"987\n654\n321\n975\n ", "424\n983\n837", "456\n789\n123\n136"
), Year = c(2015, 2014, 2010, 2015, 2014), Name = c("Michael\nLewis\nPaul\n ",
"Elaine\nJerry\nGeorge\nKramer", "John\nPaul\nGeorge\nRingo\nNA",
"Paul\nMichael\nLewis", "Jerry\nGeorge\nElaine\nKramer")), .Names = c("SerialNum",
"Year", "Name"), row.names = c(NA, -5L), class = "data.frame")

Resources