Combine unique rows from multiple data.tables and add attribute details

Combine unique rows from multiple data.tables and add attribute details - r

I have two data.tables in this format (the actual tables have about a million rows in each):
library(data.table)
dt1 <- data.table(
code=c("A001", "A002","A003","A004","A005"),
x=c(65,92,25,450,12),
y=c(98,506,72,76,15),
no1=c(010101, 010156, 028756, 372576,367383),
no2=c(876362,"",682973,78269,"")
)
dt2 <- data.table(
code=c("A003", "A004","A005","A006","A007","A008","A009"),
x=c(25,126,12,55,34,134,55),
y=c(72,76,890,568,129,675,989),
no1=c(028756, 372576,367383,234876, 287156, 123348, 198337),
no2=c(682973,78269,65378,"","","",789165)
)
I would like to combine the two together and keep only unique rows based on all column entries being unique. This is what I have but I assume there is a better way of doing it:
dt3 <- rbindlist(list(dt1, dt2))
dt3 <- unique(dt3, by = c("code", "x", "y", "no1", "no2"))
Once I have this single dataset I would like to give any duplicate 'code' records some attribute information (version number and a comment about what's different in that version to the previous one). The output I am looking for would be this:
dt4 <- data.table(
code=c("A001", "A002","A003","A004","A005", "A004","A005","A006","A007","A008","A009"),
x=c(65,92,25,450,12,126,12,55,34,134,55),
y=c(98,506,72,76,15,76,890,568,129,675,989),
no1=c(010101, 010156, 028756, 372576,367383, 372576,367383,234876, 287156, 123348, 198337),
no2=c(876362,"",682973,78269,"",78269,65378,"","","",789165),
version = c("V1","V1","V1","V1","V1","V2","V2","V1","V1","V1","V1"),
unique_version=c("A001_V1", "A002_V1","A003_V1","A004_V1","A005_V1", "A004_V2","A005_V2","A006_V1","A007_V1","A008_V1","A009_V1"),
comment = c("First_entry","First_entry","First_entry","First_entry","First_entry","New_x", "New_y_and_no2","First_entry","First_entry","First_entry","First_entry")
)
I'm not sure how to achieve dt4 (and in an efficient way considering the size of the real dataset will be over a million rows).
Edit
Having applied #Chase's solution to my real data I noticed my dt3 example varies slightly from the type of result I am getting. This looks more like my real data:
dt6 <- data.table(
code=c("A111", "A111","A111","A111","A111", "A111","A111","A234", "A234","A234","A234","A234", "A234","A234"),
x=c("",126,126,"",836,843,843,126,126,"",127,836,843,843),
y=c("",76,76,"",456,465,465,76,76,"",77,456,465,465),
no1=c(028756, 028756,028756,057756, 057756, 057756, 057756,028756, 028756,057756,057756, 057756, 057756, 057756),
no2=c("","",034756,"","","",789165,"",034756,"","","","",789165)
)
comp_cols <- c("x", "y", "no1", "no2")
#grabs the names of the mismatching values and formats them how you did
f <- function(x, y) {
  # Names of `x` at the positions where `x` and `y` hold different values.
  changed_cols <- names(x)[x != y]
  # Join them as "<a>_and_<b>..." and prefix the result with "New_".
  label <- paste0(changed_cols, collapse = "_and_")
  paste0("New_", label)
}
dt6[, version := paste0("V", 1:.N), by = code]
dt6[, unique_version := paste(code, version, sep = "_")]
dt6[, comment := ifelse(version == "V1", "First_entry", f(.SD[1], .SD[2])), by = code, .SDcols = comp_cols]
As you can see, the suggested solution for creating the comment column seems to return only the first change, i.e. the one between the first and second versions (and not the changes between V2 and V3, etc.).

Here's one solution - the first two are trivial, the comment takes a little more thought:
dt5 <- copy(dt3)
comp_cols <- c("x", "y", "no1", "no2")
#grabs the names of the mismatching values and formats them how you did
# Describe which columns differ between two one-row tables.
#   x, y: one-row tables with the same columns (the names are taken from `x`).
# Returns a single string such as "New_x" or "New_y_and_no2".
f <- function(x, y) {
  n_x <- names(x)
  diff <- x != y
  # The separator needs an underscore on both sides; "_and" alone glues the
  # next column name straight onto "and" (e.g. "New_y_andno2").
  paste0("New_", paste0(n_x[diff], collapse = "_and_"))
}
# Number the rows of each code in order of appearance: V1, V2, ...
dt5[, version := paste0("V", seq_len(.N)), by = code]
dt5[, unique_version := paste(code, version, sep = "_")]
# The first row of each code is the baseline; every later row is compared with
# the row directly before it, so V3 is described relative to V2, V4 to V3, etc.
# A single f(.SD[1], .SD[2]) inside ifelse() would instead be recycled, giving
# every later version the V1-to-V2 difference.
# NOTE: f() is called once per non-first row, which may be slow on very large
# tables.
dt5[, comment := c(
  "First_entry",
  vapply(seq_len(.N)[-1L], function(i) f(.SD[i - 1L], .SD[i]), character(1L))
), by = code, .SDcols = comp_cols]
End up yielding this:
> dt5
code x y no1 no2 version unique_version comment
1: A001 65 98 10101 876362 V1 A001_V1 First_entry
2: A002 92 506 10156 V1 A002_V1 First_entry
3: A003 25 72 28756 682973 V1 A003_V1 First_entry
4: A004 450 76 372576 78269 V1 A004_V1 First_entry
5: A005 12 15 367383 V1 A005_V1 First_entry
6: A004 126 76 372576 78269 V2 A004_V2 New_x
7: A005 12 890 367383 65378 V2 A005_V2 New_y_andno2
8: A006 55 568 234876 V1 A006_V1 First_entry
9: A007 34 129 287156 V1 A007_V1 First_entry
10: A008 134 675 123348 V1 A008_V1 First_entry
11: A009 55 989 198337 789165 V1 A009_V1 First_entry

Related

cbind and match in data.frame structure

As I am new to the data.table package, I would like to replicate what I would normally do in a data.frame structure below, to a data.table structure.
Dta <- data.frame(Customer = c("Javier","Oscar","Ivan","Peter"),Type_of_Customer=LETTERS[c(1,1:3)])
Dtb <- data.frame(Customer = c("Javier","Oscar","Ivan","Jack"),Zone=5:8,District=100:103)
Result <- cbind(Dtb[match(Dtb[,"Customer"],Dta[,"Customer"]),c("Zone","District")],Dta)
ww <- which(is.na(Result[,"Zone"]))
if(length(ww) > 0){
Result[ww,"Zone"] <- "Not in Dtb"
}
ww <- which(is.na(Result[,"District"]))
if(length(ww) > 0){
Result[ww,"District"] <- "Not in Dtb"
}
So If I had Dta and Dtb as data.table structure, what would be the way to go?
(Note: In the real sample I have around 10 million rows so I would need the more time-efficient solution)
Dta <- data.table(Custumer = c("Javier","Oscar","Ivan","Peter"),Type_of_Customer=LETTERS[c(1,1:3)])
Dtb <- data.table(Custumer = c("Javier","Oscar","Ivan","Jack"),Zone=5:8,District=100:103)
Thanks.

We can use a join on the 'Custumer' column and replace the NA elements with the string 'Not in Dtb'
Dtb[Dta, on = .(Custumer)][, c("Zone", "District") :=
.(as.character(Zone), as.character(District))
][is.na(Zone), c("Zone", "District") := "Not in Dtb"][]
# Custumer Zone District Type_of_Customer
#1: Javier 5 100 A
#2: Oscar 6 101 A
#3: Ivan 7 102 B
#4: Peter Not in Dtb Not in Dtb C

Merge data frames based on numeric rownames within a chosen threshold and keeping unmatched rows as well

How can I merge two data frames based on their numeric rownames while setting a threshold for the match?
df1 <- structure(list(c(4974622.505928, 170582.149747, 130545.004516,
143528.819582, 49416.594892, 51879.515558, 52027.462651, 42491.317116,
49173.145029, 44040.01261), c(4664319.00309, 266278.599338, 204772.412837,
204819.210688, 77718.961761, 82742.852809, 79706.774944, 67123.603629,
67264.401059, 66750.260768), c(5906075.502923, 385318.121061,
296824.944672, 308432.753482, 113407.50333, 120352.400266, 122622.356104,
98656.179336, 107669.002489, 100262.855064), c(5401712.020682,
204595.653994, 163485.509823, 179567.339348, 62690.116298, 63790.0244,
64660.971879, 52545.84055, 59080.66972, 54579.538267), c(5273676.522307,
159130.126808, 129607.971309, 142279.787439, 45812.561022, 47230.447746,
48367.405274, 39578.235275, 45489.065198, 43102.923417)), row.names = c("34.9816256",
"35.0576674", "35.0898006", "35.1270264", "35.1738664", "35.1936282",
"35.2043582", "35.2359934", "35.2716016", "35.2993064"), class = "data.frame")
df2 <- structure(list(c(5898584.48405, 302326.226264, 185567.968257,
205617.778019, 84476.66928, 65505.560486, 68121.465276, 63221.947902,
55028.866127, 36821.607091), c(3719350.766633, 108177.577417,
68855.378083, 78201.248427, 17558.118703, 23387.078772, 25374.978916,
18833.579115, 12761.529092, 11507.348928), c(3587498.99736, 96793.741428,
59750.485295, 70217.309923, 26233.188472, 20200.080468, 22241.999451,
20268.485836, 17330.391134, 12503.133961), c(3128479.008712,
70298.795438, 45668.592667, 56013.453832, 20323.368372, 16795.27218,
16358.208042, 15722.790712, 12276.726458, 9155.522864), c(3847005.494149,
138762.296854, 94196.099405, 106888.964213, 36614.870588, 30856.787329,
33880.704043, 31399.328936, 27819.255931, 18560.05768)), row.names = c("34.9815906",
"35.0356588", "35.0897702", "35.1269978", "35.1535182", "35.1744048",
"35.1952968", "35.3032464", "35.3207828", "35.3739834"), class = "data.frame")
Output of the dataframes (first row is rownames)
> df1
34.9816256 4974622.51 4664319.00 5906075.50 5401712.02 5273676.52
35.0576674 170582.15 266278.60 385318.12 204595.65 159130.13
35.0898006 130545.00 204772.41 296824.94 163485.51 129607.97
35.1270264 143528.82 204819.21 308432.75 179567.34 142279.79
35.1738664 49416.59 77718.96 113407.50 62690.12 45812.56
35.1936282 51879.52 82742.85 120352.40 63790.02 47230.45
35.2043582 52027.46 79706.77 122622.36 64660.97 48367.41
35.2359934 42491.32 67123.60 98656.18 52545.84 39578.24
35.2716016 49173.15 67264.40 107669.00 59080.67 45489.07
35.2993064 44040.01 66750.26 100262.86 54579.54 43102.92
> df2
34.9815906 5898584.48 3719350.77 3587499.00 3128479.009 3847005.49
35.0356588 302326.23 108177.58 96793.74 70298.795 138762.30
35.0897702 185567.97 68855.38 59750.49 45668.593 94196.10
35.1269978 205617.78 78201.25 70217.31 56013.454 106888.96
35.1535182 84476.67 17558.12 26233.19 20323.368 36614.87
35.1744048 65505.56 23387.08 20200.08 16795.272 30856.79
35.1952968 68121.47 25374.98 22242.00 16358.208 33880.70
35.3032464 63221.95 18833.58 20268.49 15722.791 31399.33
35.3207828 55028.87 12761.53 17330.39 12276.726 27819.26
35.3739834 36821.61 11507.35 12503.13 9155.523 18560.06
I want to merge these two datasets based on their rownames IF the difference between the two numbers in the rownames is between [-0.02, 0.02]
In other words, each rowname from df1 should be compared to each rowname in df2, and if two rownames are found with their difference falling in the range [-0.02, 0.02], then the data can be merged on the same row. If a match is not found, NA will be added to where there was no matching data from the other df (as in full_join).

You need to set column names and add the rownames as a column
library(tibble)
colnames(df1) <- c('a1', 'b1', 'c1', 'd1', 'e1')
df1 <- rownames_to_column(df1, "rn1")
colnames(df2) <- c('a2', 'b2', 'c2', 'd2', 'e2')
df2 <- rownames_to_column(df2, "rn2")
concatenate the two data frames
df3 <- cbind(df1, df2)
Calculate the difference between rownames
df3['diff'] <- as.numeric(df3$rn1) - as.numeric(df3$rn2)
filter and drop the columns you do not need
library(tidyverse)
df4 <- df3 %>%
filter(diff >= -0.02 & diff <= 0.02) %>%
select(-c(rn1, rn2, diff))
# a1 b1 c1 d1 e1 a2 b2 c2 d2 e2
#1 4974622.51 4664319.00 5906075.5 5401712.02 5273676.52 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49
#2 130545.00 204772.41 296824.9 163485.51 129607.97 185567.97 68855.38 59750.49 45668.59 94196.10
#3 143528.82 204819.21 308432.8 179567.34 142279.79 205617.78 78201.25 70217.31 56013.45 106888.96
#4 51879.52 82742.85 120352.4 63790.02 47230.45 65505.56 23387.08 20200.08 16795.27 30856.79
#5 52027.46 79706.77 122622.4 64660.97 48367.41 68121.47 25374.98 22242.00 16358.21 33880.70

You can use foverlaps from data.table package
library(data.table)
# add column names to the sample data, as they are currently NULL
names(df1) <- paste0("df1_", seq_len(ncol(df1)))
names(df2) <- paste0("df2_", seq_len(ncol(df2)))
# convert the rownames into the first column
setDT(df1, keep.rownames = TRUE)[]
setnames(df1, 1, "df1_rn")
setDT(df2, keep.rownames = TRUE)[]
setnames(df2, 1, "df2_rn")
# add temporary columns to both data tables:
#  - df1 gets a numeric copy of its rowname (`temp`) so that each row forms the
#    zero-width interval [df1_rn, temp]
#  - df2 gets the interval [df2_rn - 0.02, df2_rn + 0.02]
df1[, `:=`(df1_rn = as.numeric(df1_rn), temp = as.numeric(df1_rn))]
df2[, `:=`(df2_rn_minus_2 = as.numeric(df2_rn) - 0.02, df2_rn_plus_2 = as.numeric(df2_rn) + 0.02)]
# foverlaps() is given df2 keyed on its interval columns
setkey(df2, df2_rn_minus_2, df2_rn_plus_2)
# overlap join, then drop the temporary helper columns
DT <- foverlaps(df1, df2, by.x = c("df1_rn", "temp"))[, !c("df2_rn_minus_2", "df2_rn_plus_2", "temp"), with = FALSE]
which gives
> DT
df2_rn df2_1 df2_2 df2_3 df2_4 df2_5 df1_rn df1_1 df1_2 df1_3
1: 34.9815906 5898584.48 3719350.77 3587499.00 3128479.01 3847005.49 34.98163 4974622.51 4664319.00 5906075.50
2: <NA> NA NA NA NA NA 35.05767 170582.15 266278.60 385318.12
3: 35.0897702 185567.97 68855.38 59750.49 45668.59 94196.10 35.08980 130545.00 204772.41 296824.94
4: 35.1269978 205617.78 78201.25 70217.31 56013.45 106888.96 35.12703 143528.82 204819.21 308432.75
5: 35.1744048 65505.56 23387.08 20200.08 16795.27 30856.79 35.17387 49416.59 77718.96 113407.50
6: 35.1744048 65505.56 23387.08 20200.08 16795.27 30856.79 35.19363 51879.52 82742.85 120352.40
7: 35.1952968 68121.47 25374.98 22242.00 16358.21 33880.70 35.19363 51879.52 82742.85 120352.40
8: 35.1952968 68121.47 25374.98 22242.00 16358.21 33880.70 35.20436 52027.46 79706.77 122622.36
9: <NA> NA NA NA NA NA 35.23599 42491.32 67123.60 98656.18
10: <NA> NA NA NA NA NA 35.27160 49173.15 67264.40 107669.00
11: 35.3032464 63221.95 18833.58 20268.49 15722.79 31399.33 35.29931 44040.01 66750.26 100262.86
df1_4 df1_5
1: 5401712.02 5273676.52
2: 204595.65 159130.13
3: 163485.51 129607.97
4: 179567.34 142279.79
5: 62690.12 45812.56
6: 63790.02 47230.45
7: 63790.02 47230.45
8: 64660.97 48367.41
9: 52545.84 39578.24
10: 59080.67 45489.07
11: 54579.54 43102.92

Splitting data.table column into many unknow number of columns based on pattern

I want to split column B of data.table dt1 into many columns based on the spaces between its values.
df1 <-
structure(list(B = c("3,845,168 15,467,645 15,054,813 913 30,523,371",
"3,104,154 12,495,278 12,298,236 223 24,793,737", "741,014 2,972,367 2,756,577 690 5,729,634",
"218,044 1,035,308 1,008,748 18 2,044,074", "200,744 961,775 942,901 13 1,904,689",
"17,300 73,533 65,847 5 139,385"), C = c("17,743,645", "14,456,435",
"3,287,210", "1,165,692", "1,071,138", "94,554"), D = c("102.74",
"101.60", "107.83", "102.63", "102.00", "111.67"), E = c("2.89",
"2.87", "2.96", "2.99", "3.07", "2.06")), .Names = c("B", "C",
"D", "E"), row.names = c(NA, -6L), class = "data.frame"
)
library(data.table)
dt1 <- data.table(df1)
dt1
B C D E
1: 3,845,168 15,467,645 15,054,813 913 30,523,371 17,743,645 102.74 2.89
2: 3,104,154 12,495,278 12,298,236 223 24,793,737 14,456,435 101.60 2.87
3: 741,014 2,972,367 2,756,577 690 5,729,634 3,287,210 107.83 2.96
4: 218,044 1,035,308 1,008,748 18 2,044,074 1,165,692 102.63 2.99
5: 200,744 961,775 942,901 13 1,904,689 1,071,138 102.00 3.07
6: 17,300 73,533 65,847 5 139,385 94,554 111.67 2.06

We could use the tstrsplit
tmp <- dt1[, tstrsplit(B, "\\s+")]
dt1[, paste0("B", seq_along(tmp)) := tmp]
rm(tmp)
Or as #DavidArenburg mentioned, we can avoid the creation of temporary object by first finding out the number of spaces with stri_count_fixed from stringi and then use tstrsplit with fixed = TRUE argument
M <- max(stringi::stri_count_fixed(dt1$B, " ")) + 1
dt1[, paste0("B", seq_len(M)) := tstrsplit(B, " ", fixed = TRUE)]
Update
Because values containing a , cannot be converted to numeric, we remove the commas first and then split with type.convert = TRUE in tstrsplit
dt1[, paste0("B", seq_len(M)) := tstrsplit(gsub(",", "", B), " ",
fixed = TRUE, type.convert = TRUE)]

Divide list of columns by a second list of columns

I have script that generates a data.table with some columns I want to divide by some other columns and store the results in new columns. Here's an example.
library(data.table)
dt <- data.table(V1 = c( 5.553465, 4.989168, 2.563682, 6.987971, 19.220936),
V2 = c(4.248335, 19.768138, 3.840026, 17.411003, 17.939368),
V3 = c(9.683953, 15.344424, 11.729091, 7.534210, 5.404000),
V4 = c(5.949093, 4.553023, 9.765656, 11.211069, 4.085964),
V5 = c(11.814671, 5.460138, 2.492230, 1.48792, 8.164280))
list1 <- list(c("V1", "V2", "V3"))
list2 <- list(c("V2", "V4", "V5"))
listRatio <- list(c("rat1","rat2","rat3"))
I have tried a variety of approaches to dividing the values in the list1 elements by the values in the list2 elements, unsuccessfully. Two are below; neither works.
dt[, (listRatio) := list1/list2]
dt[, c("rat1","rat2","rat3") := mapply(dt, function(x,y) x / y, x = c(V1, V2, V3), y = c(V2, V4, V5))]

We need to convert the list to vector by using [[ and then get the values of each vector in a list with mget, use Map to divide (/) the corresponding columns of each of the list values and assign it to the vector (listRatio[[1]]).
dt[, (listRatio[[1]]) := Map(`/`, mget(list1[[1]]), mget(list2[[1]]))]
dt
# V1 V2 V3 V4 V5 rat1 rat2 rat3
#1: 5.553465 4.248335 9.683953 5.949093 11.814671 1.3072098 0.7141147 0.8196549
#2: 4.989168 19.768138 15.344424 4.553023 5.460138 0.2523843 4.3417611 2.8102630
#3: 2.563682 3.840026 11.729091 9.765656 2.492230 0.6676210 0.3932174 4.7062635
#4: 6.987971 17.411003 7.534210 11.211069 1.487920 0.4013537 1.5530190 5.0635854
#5: 19.220936 17.939368 5.404000 4.085964 8.164280 1.0714389 4.3904861 0.6619077
NOTE: As #Frank mentioned in the comments, it is better to create a vector of variables names and not a list.

By using data.frame function
# Example data held in a plain data.frame.
dt <- data.frame(V1 = c( 5.553465, 4.989168, 2.563682, 6.987971, 19.220936),
                 V2 = c(4.248335, 19.768138, 3.840026, 17.411003, 17.939368),
                 V3 = c(9.683953, 15.344424, 11.729091, 7.534210, 5.404000),
                 V4 = c(5.949093, 4.553023, 9.765656, 11.211069, 4.085964),
                 V5 = c(11.814671, 5.460138, 2.492230, 1.48792, 8.164280))
# Numerator and denominator columns, each wrapped in a one-element list.
list1 <- list(dt[,c("V1", "V2", "V3")])
list2 <- list(dt[,c("V2", "V4", "V5")])
# Divide the columns pairwise (V1/V2, V2/V4, V3/V5) and store the results as
# new numeric columns. No character placeholder columns are needed, and the
# pairing of columns is explicit rather than relying on unlist() flattening.
dt[c("rat1", "rat2", "rat3")] <- unname(Map(`/`, list1[[1]], list2[[1]]))
V1 V2 V3 V4 V5 rat1 rat2 rat3
1 5.553465 4.248335 9.683953 5.949093 11.814671 1.3072098 0.7141147 0.8196549
2 4.989168 19.768138 15.344424 4.553023 5.460138 0.2523843 4.3417611 2.8102630
3 2.563682 3.840026 11.729091 9.765656 2.492230 0.6676210 0.3932174 4.7062635
4 6.987971 17.411003 7.534210 11.211069 1.487920 0.4013537 1.5530190 5.0635854
5 19.220936 17.939368 5.404000 4.085964 8.164280 1.0714389 4.3904861 0.6619077

Extracting Column data from .csv and turning every 10 consecutive rows into corresponding columns

Below is the code I am trying to implement. I want to extract each run of 10 consecutive row values and turn them into corresponding columns.
This is how data looks like: https://drive.google.com/file/d/0B7huoyuu0wrfeUs4d2p0eGpZSFU/view?usp=sharing
I have been trying but temp1 and temp2 comes out to be empty. Please help.
library(Hmisc) #for increment function
myData <- read.csv("Clothing_&_Accessories.csv",header=FALSE,sep=",",fill=TRUE) # reading the csv file
extract<-myData$V2 # extracting the desired column
x<-1
y<-1
temp1 <- NULL #initialisation
temp2 <- NULL #initialisation
data.sorted <- NULL #initialisation
limit<-nrow(myData) # Calculating no of rows
while (x! = limit) {
count <- 1
for (count in 11) {
if (count > 10) {
inc(x) <- 1
break # gets out of for loop
}
else {
temp1[y]<-data_mat[x] # extracting by every row element
}
inc(x) <- 1 # increment x
inc(y) <- 1 # increment y
}
temp2<-temp1
data.sorted<-rbind(data.sorted,temp2) # turn rows into columns
}

Your code is too complex. You can do this using only one for loop, without external packages, like this:
# Toy input: two records of 10 values each, separated by one empty row.
myData <- as.data.frame(matrix(c(rep("a", 10), "", rep("b", 10)), ncol=1), stringsAsFactors = FALSE)
# Empty 10-row frame; one column is appended per record.
newData <- data.frame(row.names=1:10)
# Each record occupies 11 rows (10 values + 1 separator); the last record may
# lack the trailing separator, hence the "+ 1". Integer division with
# seq_len() avoids a fractional loop bound and skips the loop on empty input.
n_records <- (nrow(myData) + 1L) %/% 11L
for (i in seq_len(n_records)) {
  start <- 11L * i - 10L
  newData[[paste0("col", i)]] <- myData$V1[start:(start + 9L)]
}
You don't actually need all this though. You can simply remove the empty lines, split the vector in chunks of size 10 (as explained here) and then turn the list into a data frame.
vec <- myData$V1[nchar(myData$V1)>0]
as.data.frame(split(vec, ceiling(seq_along(vec)/10)))
# X1 X2
# 1 a b
# 2 a b
# 3 a b
# 4 a b
# 5 a b
# 6 a b
# 7 a b
# 8 a b
# 9 a b
# 10 a b

We could create a numeric index based on the '' values in the 'V2' column, split the dataset, use Reduce/merge to get the columns in the wide format.
indx <- cumsum(myData$V2=='')+1
res <- Reduce(function(...) merge(..., by= 'V1'), split(myData, indx))
res1 <- res[order(factor(res$V1, levels=myData[1:10, 1])),]
colnames(res1)[-1] <- paste0('Col', 1:3)
head(res1,3)
# V1 Col1 Col2 Col3
#2 ProductId B000179R3I B0000C3XXN B0000C3XX9
#4 product_title Amazon.com Amazon.com Amazon.com
#3 product_price unknown unknown unknown
From the p1.png, the 'V1' column can also be the column names for the values in 'V2'. If that is the case, we can 'transpose' the 'res1' except the first column and change the column names of the output with the first column of 'res1' (setNames(...))
res2 <- setNames(as.data.frame(t(res1[-1]), stringsAsFactors=FALSE),
res1[,1])
row.names(res2) <- NULL
res2[] <- lapply(res2, type.convert)
head(res2)
# ProductId product_title product_price userid
#1 B000179R3I Amazon.com unknown A3Q0VJTU04EZ56
#2 B0000C3XXN Amazon.com unknown A34JM8F992M9N1
#3 B0000C3XX9 Amazon.com unknown A34JM8F993MN91
# profileName helpfulness reviewscore review_time
#1 Jeanmarie Kabala "JP Kabala" 7/7 4 1182816000
#2 M. Shapiro 6/6 5 1205107200
#3 J. Cruze 8/8 5 120571929
# review_summary
#1 Periwinkle Dartmouth Blazer
#2 great classic jacket
#3 Good jacket
# review_text
#1 I own the Austin Reed dartmouth blazer in every color
#2 This is the second time I bought this jacket
#3 This is the third time I bought this jacket
I guess this is just a reshaping issue. In that case, we can use dcast from data.table to convert from long to wide format
library(data.table)
DT <- dcast(setDT(myData)[V1!=''][, N:= paste0('Col', 1:.N) ,V1], V1~N,
value.var='V2')
data
myData <- structure(list(V1 = c("ProductId", "product_title",
"product_price",
"userid", "profileName", "helpfulness", "reviewscore", "review_time",
"review_summary", "review_text", "", "ProductId", "product_title",
"product_price", "userid", "profileName", "helpfulness",
"reviewscore",
"review_time", "review_summary", "review_text", "", "ProductId",
"product_title", "product_price", "userid", "profileName",
"helpfulness",
"reviewscore", "review_time", "review_summary", "review_text"
), V2 = c("B000179R3I", "Amazon.com", "unknown", "A3Q0VJTU04EZ56",
"Jeanmarie Kabala \"JP Kabala\"", "7/7", "4", "1182816000",
"Periwinkle Dartmouth Blazer",
"I own the Austin Reed dartmouth blazer in every color", "",
"B0000C3XXN", "Amazon.com", "unknown", "A34JM8F992M9N1",
"M. Shapiro",
"6/6", "5", "1205107200", "great classic jacket",
"This is the second time I bought this jacket",
"", "B0000C3XX9", "Amazon.com", "unknown", "A34JM8F993MN91",
"J. Cruze", "8/8", "5", "120571929", "Good jacket",
"This is the third time I bought this jacket"
)), .Names = c("V1", "V2"), row.names = c(NA, 32L),
class = "data.frame")

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Combine unique rows from multiple data.tables and add attribute details - r

Related

cbind and match in data.frame structure

Merge data frames based on numeric rownames within a chosen threshold and keeping unmatched rows as well

Splitting data.table column into many unknow number of columns based on pattern

Divide list of columns by a second list of columns

Extracting Column data from .csv and turning every 10 consecutive rows into corresponding columns

Categories

Resources