I know the subject is widely covered, but I didn't find code that works for my case... I have a dataframe of this type:
V1 V2 V3
1: label1 alias_fr alias_fr
2: label1 triplet triplet
3: label1 Q9327 Q3122270
4: label2 NULL NULL
5: label3 alias_fr NULL
6: label3 triplet NULL
7: label3 Q678 NULL
This dataframe is generated after mapping a json output to a query from a df input:
df <- Map(rbind, originalDF$input,out) #I first used Map(c,..) but it seems to be more difficult to reshape than rbind
df <- rbind.fill(lapply(df,function(y){as.data.frame(t(y),stringsAsFactors=FALSE)}))
class(df)
[1] "data.frame"
The example given is simplified though, as I have more than 3 columns, and some values are lists. Nevertheless when I have non-NULL values for a label I have always the same number of rows within a column (3 in my example: alias_fr, triplet, Qxx).
And I would like to have V2 and V3 values in row for each V1 value:
V1 var1 var2 var3
label1 alias_fr triplet Q9327
label1 alias_fr triplet Q3122270
label2 NULL NULL NULL
label3 alias_fr triplet Q678
I tried starting with melt: melt(df,id="V1"), but then I am stuck.
I also tried reshape, cast, dcast, without any success, and I am more and more confused by all the reshaping stuff... If a reshape master is around, I would be very grateful ;)
[Edit]: real objects to clarify my issue
Ok so this is an extract of the real dataset I’m working with:
#original dataset (actually it’s one column of the dataset)
originalDF <- c("Guy de Maupassant", "J.-J. Goldman", "Poitou-Charentes")
#output of the API query from the text in the orginalDF
out <- list(structure(list(`_index` = c("alias_fr", "alias_fr"), `_type` = c("triplet",
"triplet"), `_id` = c("Q9327", "Q3122270"), `_score` = c(NA,
NA), sort = list(-4.95263021255079, -6.65910164747673), `_source.types` = list(
structure(list(id = c("Q5", "dbPedia.Person"), value = c("être humain",
"personne")), .Names = c("id", "value"), class = "data.frame", row.names = 1:2),
structure(list(id = c("Q11424", "dbPedia.Film"), value = c("film",
"film")), .Names = c("id", "value"), class = "data.frame", row.names = 1:2)),
`_source.pageRank` = c(-4.95263021255079, -6.65910164747673
), `_source.subTypes` = list(structure(list(id = c("Q1930187",
"Q36180", "Q15949613", "Q6625963", "Q214917"), value = c("journaliste",
"écrivain", "nouvelliste", "romancier", "dramaturge")), .Names = c("id",
"value"), class = "data.frame", row.names = c(NA, 5L)), NULL),
`_source.label` = c("Guy de Maupassant", "Guy de Maupassant"
), `_source.id` = c("Q9327", "Q3122270")), .Names = c("_index",
"_type", "_id", "_score", "sort", "_source.types", "_source.pageRank",
"_source.subTypes", "_source.label", "_source.id"), class = "data.frame", row.names = 1:2),
list(), structure(list(`_index` = "alias_fr", `_type` = "triplet",
`_id` = "Q17009", `_score` = NA, sort = list(-5.0448283638424),
`_source.types` = list(structure(list(id = "Q22670030",
value = "ancienne région française"), .Names = c("id",
"value"), class = "data.frame", row.names = 1L)), `_source.pageRank` = -5.0448283638424,
`_source.label` = "Poitou-Charentes", `_source.id` = "Q17009"), .Names = c("_index",
"_type", "_id", "_score", "sort", "_source.types", "_source.pageRank",
"_source.label", "_source.id"), class = "data.frame", row.names = 1L))
#df object (generated from Map, then rbind.fill)
df <- structure(list(V1 = list("Guy de Maupassant", "Guy de Maupassant",
"Guy de Maupassant", "Guy de Maupassant", "Guy de Maupassant",
"Guy de Maupassant", "Guy de Maupassant", "Guy de Maupassant",
"Guy de Maupassant", "Guy de Maupassant", "J.-J. Goldman",
"Poitou-Charentes", "Poitou-Charentes", "Poitou-Charentes",
"Poitou-Charentes", "Poitou-Charentes", "Poitou-Charentes",
"Poitou-Charentes", "Poitou-Charentes", "Poitou-Charentes"),
V2 = list("alias_fr", "triplet", "Q9327", NA_character_,
-4.95263021255079, structure(list(id = c("Q5", "dbPedia.Person"
), value = c("être humain", "personne")), .Names = c("id",
"value"), class = "data.frame", row.names = 1:2), "-4.95263021255079",
structure(list(id = c("Q1930187", "Q36180", "Q15949613",
"Q6625963", "Q214917"), value = c("journaliste", "écrivain",
"nouvelliste", "romancier", "dramaturge")), .Names = c("id",
"value"), class = "data.frame", row.names = c(NA, 5L)),
"Guy de Maupassant", "Q9327", NULL, "alias_fr", "triplet",
"Q17009", NA_character_, -5.0448283638424, structure(list(
id = "Q22670030", value = "ancienne région française"), .Names = c("id",
"value"), class = "data.frame", row.names = 1L), "-5.0448283638424",
"Poitou-Charentes", "Q17009"), V3 = list("alias_fr",
"triplet", "Q3122270", NA_character_, -6.65910164747673,
structure(list(id = c("Q11424", "dbPedia.Film"), value = c("film",
"film")), .Names = c("id", "value"), class = "data.frame", row.names = 1:2),
"-6.65910164747673", NULL, "Guy de Maupassant", "Q3122270",
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL), V4 = list(NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL)), .Names = c("V1", "V2",
"V3", "V4"), row.names = c(NA, 20L), class = "data.frame")
For your particular example (each label has a maximum of three rows), one approach would be
# library() fails loudly if dplyr is missing; require() would only return FALSE.
library(dplyr)
# Toy data. The column names differ from yours because each column looks like
# it belongs to a separate measurement.
df <- data.frame(
  label = c(rep("a", 3), "b", rep("c", 3)),
  id1 = c(1, 2, 3, NA, 1, 2, 4),
  id2 = c(1, 2, 5, NA, NA, NA, NA)
)
# Split df into one data frame per measurement column.
df1 <- select(df, label, id1)
df2 <- select(df, label, id2)
# Group by label and spread the (at most three) rows of each label into
# var1..var3. Indexing past the end of a group yields NA, which fills in the
# labels that have fewer than three rows.
df1a <- df1 %>%
  group_by(label) %>%
  summarise(var1 = id1[1], var2 = id1[2], var3 = id1[3])
df2b <- df2 %>%
  group_by(label) %>%
  summarise(var1 = id2[1], var2 = id2[2], var3 = id2[3])
# Stack the per-measurement results on top of each other.
df_final <- rbind(df1a, df2b)
> df_final
# A tibble: 6 x 4
label var1 var2 var3
<fctr> <dbl> <dbl> <dbl>
1 a 1 2 3
2 b NA NA NA
3 c 1 2 4
4 a 1 2 5
5 b NA NA NA
6 c NA NA NA
I know that this is not elegant, and not generalisable, because you manually assign a new column for each row that you have/ do not have in your df, but it should work with your example.
Related
I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
# Left-join every table in the demo list on sample_id.
merged_tables <- plyr::join_all(list_of_tables, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
bind_rows(list_of_tables, .id = 'id') %>%
pivot_wider(names_from = id,
values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
# `suffix` in left_join() only renames non-key columns that clash between the
# two tables being joined, so threading it through reduce2() mislabels columns
# once there are more than two tables. Instead, tag every non-key column with
# its list name first, then fold the list with plain left joins.
list_of_tables %>%
  imap(function(tbl, nm) {
    rename_with(tbl, function(col) paste0(col, nm), .cols = -sample_id)
  }) %>%
  reduce(function(x, y) left_join(x, y, by = "sample_id"))
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
# Append each table's list name to every column except the first, then
# left-join all renamed tables on sample_id.
imap(list_of_tables, ~ {
# Keep the list name under a plain name: inside the inner lambda `.x` refers
# to the column names being renamed, not to the table.
nm <- .y
.x %>% rename_with(~str_c(.x, nm), -1)
}) %>%
plyr::join_all( by = "sample_id", type = "left")
Or use a for loop
# Start from the first table, appending its list name to every column except
# the first one.
tmp <- list_of_tables[[1]]
names(tmp)[-1] <- paste0(names(tmp)[-1], names(list_of_tables)[1])
# Rename each remaining table the same way and left-join it onto the result.
for(nm in names(list_of_tables)[-1]) {
tmp2 <- list_of_tables[[nm]]
names(tmp2)[-1] <- paste0(names(tmp2)[-1], nm)
tmp <- left_join(tmp, tmp2, by = "sample_id")
}
# Print the merged table.
tmp
The values from the other data frame become NA after I use the left_join() function. I checked the answer here: [dplyr::left_join produce NA values for new joined columns].
I also specified the by argument, but it still failed.
I don't know why.
qx_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020-01-21", "2020-01-22", "2020-01-23", "2020-01-24",
"2020-01-25"), PRS = c(1013.9035387141, 1011.48779584751, 1014.28302402211,
1019.16970261716, 1018.92203467498), PRS_Sea = c(1024.73084750567,
1022.22210612717, 1025.02632842026, 1029.97905104403, 1029.77650132275
), PRS_Max = c(1014.26828869048, 1011.80445613662, 1014.51628117914,
1019.43671957672, 1019.31935504063), PRS_Min = c(1013.7138513322,
1011.13447054516, 1013.86811271731, 1018.75406934996, 1018.62469257842
), WIN_S_Max = c(2.30187606292517, 2.08586132369615, 2.76893908257748,
4.22074853552532, 3.63427225056689), WIN_S_Inst_Max = c(3.44360343442933,
3.09963836923658, 4.28499952758881, 6.68930898053666, 5.80619165721844
), WIN_D_INST_Max = c(116.878029336735, 218.745851048753, 120.88310303288,
72.1640447845805, 72.0331526360544), WIN_D_Avg_2mi = c(116.23329724764,
210.524530689871, 113.104009452075, 68.7694017991261, 70.322008604388
), WIN_S_Avg_2mi = c(1.77558118386243, 1.49959490740741, 2.20936874055178,
3.47942613851096, 2.99431642101285), WIN_D_S_Max = c(116.68018866665,
218.180671371681, 120.40502999811, 71.0831467309146, 68.3670670351474
), TEM = c(3.81968088624339, 5.16464226662887, 6.82721856103553,
5.98099596088435, 4.8940626181028), TEM_Max = c(4.04776301492819,
5.35075514928193, 6.97597470238095, 6.15192401266062, 5.07960293839758
), TEM_Min = c(3.49020455404384, 4.95346053004535, 6.65049142573696,
5.85618067365835, 4.76455794123205), RHU = c(85.9359859221466,
96.1710766250945, 91.749678760393, 88.3347741874528, 80.693040202192
), VAP = c(6.98015376984127, 8.55406509826153, 9.08114866780046,
8.27843124055178, 6.98599714191232), RHU_Min = c(83.965092356387,
95.6411387471655, 90.9997401738473, 87.3134436413454, 79.2219635770975
), PRE_1h = c(0.102133763227513, 0.422205333522298, 1488.33246492347,
0.0715384070294785, 372.116791028911)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -5L))
covid_p2 <- structure(list(province = c("安徽", "安徽", "安徽", "安徽", "安徽"
), date = c("2020/1/21", "2020/1/22", "2020/1/23", "2020/1/24",
"2020/1/25"), 新增确诊 = c(0L, 1L, 14L, 24L, 21L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
dat2 <- covid_p2 %>% left_join(qx_p2, by = c('province' = 'province', 'date' = 'date'))
dat2
Your date columns are character columns and do not have the same format:
qx_p2$date
# "2020-01-21" "2020-01-22" "2020-01-23" "2020-01-24" "2020-01-25"
covid_p2$date
# "2020/1/21" "2020/1/22" "2020/1/23" "2020/1/24" "2020/1/25"
You can get them in the same format by, for example, applying as.Date(...):
# Convert both character date columns to Date so the join keys share one type
# and representation despite the different source formats.
covid_p2$date <- as.Date(covid_p2$date)
qx_p2$date <- as.Date(qx_p2$date)
After that, your join works.
I realize my title is probably a little confusing. I have some JSON that is a little confusing to unnest. I am trying to use the tidyverse.
Sample Data
# Example data: `owners` is a list-column holding one single-row data frame per
# team; its `name` and `id` columns clash with the outer `name` and `id`.
df <- structure(list(long_abbr = c("Team11", "BBS"), short_name = c("Ac ",
"BK"), division = c("", ""), name = c("AC Slaters Muscles", "Broken Bats"
), abbr = c("T1", "T1"), owners = list(structure(list(commissioner = 0L,
name = "Chris Liss", id = "300144F8-79F4-11EA-8F25-9AE405472731"), class = "data.frame", row.names = 1L),
structure(list(commissioner = 1L, name = "Mark Ortin", id = "90849EF6-7427-11EA-95AA-4EEEAC7F8CD2"), class = "data.frame", row.names = 1L)),
id = c("1", "2"), logged_in_team = c(NA_integer_, NA_integer_
)), row.names = 1:2, class = "data.frame")
# Unnest Owners Information
df <- df %>%
unnest(owners)
I get the following error since I have duplicate columns that use name.
Error: Column names `name` and `id` must not be duplicated.
Is there an easy way to unnest the columns with a naming convention that puts the prefix owners (or, in my case, whatever the name of the column that holds the nested df is) before the nested column names? I.e. owners.commissioner, owners.name, owners.id. I'd also be interested in solutions that use camel case or an underscore, i.e. ownersName or owners_name.
set the argument names_sep:
df <- structure(
list(long_abbr = c("Team11", "BBS"),
short_name = c("Ac ", "BK"),
division = c("", ""),
name = c("AC Slaters Muscles", "Broken Bats"),
abbr = c("T1", "T1"),
owners = list(
structure(list(commissioner = 0L, name = "Chris Liss",
id = "300144F8-79F4-11EA-8F25-9AE405472731"),
class = "data.frame", row.names = 1L),
structure(list(commissioner = 1L, name = "Mark Ortin",
id = "90849EF6-7427-11EA-95AA-4EEEAC7F8CD2"),
class = "data.frame", row.names = 1L)),
id = c("1", "2"),
logged_in_team = c(NA_integer_, NA_integer_)),
row.names = 1:2, class = "data.frame"
)
tidyr::unnest(df, owners, names_sep = "_")
#> # A tibble: 2 x 10
#> long_abbr short_name division name abbr owners_commissi… owners_name
#> <chr> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 Team11 "Ac " "" AC S… T1 0 Chris Liss
#> 2 BBS "BK" "" Brok… T1 1 Mark Ortin
#> # … with 3 more variables: owners_id <chr>, id <chr>, logged_in_team <int>
Created on 2020-04-26 by the reprex package (v0.3.0)
Does this solve your problem?
I'm basically a .NET developer, and one of my project pages needs data from a MongoDB collection. I need to compare each column value, so I used R. I have retrieved the data from MongoDB, but some of the columns contain lists of variables, so I'm not able to compare each column. Can you help me separate the values of each list column and add new columns named after the list variables?
If it is possible to solve this with some algorithm, that would be preferable.
My sample Data frame(Data set)
ID UserDetails CompanyDetails
1 list(UserID = 247891,Useraltr="Admin",UsercumEmpdetaisl=list(list(FirstName="Jack",LastName="De"))) list(ComyAddress="4/8 9 Block UD",ComyReg="344/88 7 Cross UK")
2 list(UserID=c(247891,256134),Useraltr=c("Admin","SuperAdmin"),UsercumEmpdetaisl=list(list(FirstName=c("peter","jhon","Vector"),LastName =c("Anderson","VJ","PK")))) list(ComyAddress =c("1BLOCK","2BLOCK"),ComyReg=c("1MainRoad","3street"),LandMark =c("Near post Office","check post"))
Result data frame
ID UserID Useraltr FirstName LastName ComyAddress ComyReg LandMark
1 247891 Admin Jack De 4/8 9 Block UD 344/88 7 Cross UK Empty(NULL)
2 247891,256134 Admin,SuperAdmin peter,jhon,Vector Anderson,VJ,PK 1BLOCK,2BLOCK 1MainRoad,3street Near post Office,check post
data frame dput data for first 2 row.
structure(list(ID = c("1", "2"), UserDetails = list(structure(list(
UserID = 247891, Useraltr = 'Admin', UsercumEmpdetaisl = list(structure(list(
FirstName = "Jack", LastName ="De" ), .Names = c("FirstName", "LastName"
), class = "data.frame", row.names = 1L))), .Names = c("UserID",
"Useraltr", "UsercumEmpdetaisl"), class = "data.frame", row.names = 1L),
structure(list(UserID = c(247891,256134), Useraltr = c('Admin','SuperAdmin'), UsercumEmpdetaisl = list(
structure(list(FirstName = c("peter", "jhon", "Vector"), LastName = c("Anderson",
"VJ","PK")), .Names = c("FirstName", "LastName"), class = "data.frame", row.names = 1L))), .Names = c("UserID",
"Useraltr", "UsercumEmpdetaisl"), class = "data.frame", row.names = 1L)),
CompanyDetails = list(structure(list(ComyAddress = "4/8 9 Block UD"
, ComyReg = "344/88 7 Cross UK"), .Names = c("ComyAddress", "ComyReg"
), class = "data.frame", row.names = 1:2), structure(list(
ComyAddress = c("1BLOCK","2BLOCK"), ComyReg = c("1MainRoad","3 street"
),LandMark=c("Near post Office","check post")), .Names = c("ComyAddress", "ComyReg","LandMark"), class = "data.frame", row.names = 1:2))), .Names = c("ID",
"UserDetails", "CompanyDetails"), row.names = 1:2, class = "data.frame")
I have a dataframe (test) in R. Inside one of the columns contains coordinates in this list structure:
> dput(test$coordinates)
list(structure(list(x = c(-1.294832, -1.294883, -1.294262,
-1.249478), y = c(54.61024, 54.61008, 54.610016, 54.610006
)), .Names = c("x", "y"), row.names = c(NA, -284L), class = c("tbl_df",
"tbl", "data.frame")))
I've reduced the number of coordinates for clarity.
Ultimately I wish to convert the dataframe into a spatial lines dataframe, but to do that I need test$coordinates in a lines form. However, I get the following error
> lines(test$coordinates)
Error in xy.coords(x, y) :
'x' is a list, but does not have components 'x' and 'y'
I have tried to convert the test$coordinates to other forms but it usually results in some error. How do I transform this list into a line?
Extra info this is a follow up question to
Convert data frame to spatial lines data frame in R with x,y x,y coordintates
UPDATE as requested dput(head(test)):
> dput(head(test))
structure(list(rid = 1, start_id = 1L, start_code = "E02002536",
end_id = 106L, end_code = "E02006909", strategy = "fastest",
distance = 12655L, time_seconds = 2921L, calories = 211L,
document.id = 1L, array.index = 1L, start = "Geranium Close",
finish = "Hylton Road", startBearing = 0, startSpeed = 0,
start_longitude = -1.294832, start_latitude = 54.610241,
finish_longitude = -1.249478, finish_latitude = 54.680691,
crow_fly_distance = 8362, event = "depart", whence = 1473171787,
speed = 20, itinerary = 419956, clientRouteId = 0, plan = "fastest",
note = "", length = 12655, time = 2921, busynance = 42172,
quietness = 30, signalledJunctions = 3, signalledCrossings = 2,
west = -1.300074, south = 54.610006, east = -1.232447, north = 54.683814,
name = "Geranium Close to Hylton Road", walk = 0, leaving = "2016-09-06 15:23:07",
arriving = "2016-09-06 16:11:48", grammesCO2saved = 2359,
calories2 = 211, type = "route", coordinates = list(structure(list(
x = c(-1.294832, -1.294883, -1.294262, -1.294141, -1.29371,
-1.293726, -1.293742, -1.29351, -1.293368, -1.292816,
-1.248019, -1.249478), y = c(54.61024, 54.61008, 54.610016,
54.610006, 54.610038, 54.610142, 54.610247, 54.610262,
54.681238, 54.680975, 54.680601, 54.680404
)), .Names = c("x", "y"), row.names = c(NA, -284L), class = c("tbl_df",
"tbl", "data.frame")))), .Names = c("rid", "start_id", "start_code",
"end_id", "end_code", "strategy", "distance", "time_seconds",
"calories", "document.id", "array.index", "start", "finish",
"startBearing", "startSpeed", "start_longitude", "start_latitude",
"finish_longitude", "finish_latitude", "crow_fly_distance", "event",
"whence", "speed", "itinerary", "clientRouteId", "plan", "note",
"length", "time", "busynance", "quietness", "signalledJunctions",
"signalledCrossings", "west", "south", "east", "north", "name",
"walk", "leaving", "arriving", "grammesCO2saved", "calories2",
"type", "coordinates"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
lines is a plotting function. I'm assuming you want sp::SpatialLines. See ?"SpatialLines-class" for how to construct such an object.
Here's for your case, provided you don't have a "corrupt" data.frame (see at the bottom of this post).
library(sp)
# NOTE(review): `xy` is not defined in this snippet — presumably it is the data
# frame from the question (called `test` there); confirm before running.
# Keep only the first 12 rows: the dput() in the question supplies 12 x/y
# values although its row.names attribute claims 284 rows (see the "corrupt
# data frame" warning shown at the end of this answer).
coords <- as.data.frame(xy$coordinates[[1]])[1:12, ]
# Wrap the coordinates as Line -> Lines -> SpatialLines, giving the single
# Lines object the ID 1.
out <- SpatialLines(list(Lines(list(Line(coords)), ID = 1)))
An object of class "SpatialLines"
Slot "lines":
[[1]]
An object of class "Lines"
Slot "Lines":
[[1]]
An object of class "Line"
Slot "coords":
x y
1 -1.294832 54.61024
2 -1.294883 54.61008
3 -1.294262 54.61002
4 -1.294141 54.61001
5 -1.293710 54.61004
6 -1.293726 54.61014
7 -1.293742 54.61025
8 -1.293510 54.61026
9 -1.293368 54.68124
10 -1.292816 54.68097
11 -1.248019 54.68060
12 -1.249478 54.68040
Slot "ID":
[1] "1"
Slot "bbox":
min max
x -1.294883 -1.248019
y 54.610006 54.681238
Slot "proj4string":
CRS arguments: NA
To add data to this object, you should use
SpatialLinesDataFrame(out, data = yourdata)
but see this example for more info.
There's a warning when I tried to coerce your coordinates to a data.frame. Hopefully this isn't the case for your dataset.
> as.data.frame(xy$coordinates[[1]])
x y
1 -1.294832 54.61024
2 -1.294883 54.61008
3 -1.294262 54.61002
...
281 <NA> <NA>
282 <NA> <NA>
283 <NA> <NA>
284 <NA> <NA>
Warning message:
In format.data.frame(x, digits = digits, na.encode = FALSE) :
corrupt data frame: columns will be truncated or padded with NAs