I'm trying to reproduce a data frame and dput is not cooperating.
dput command :
dput(head(data, 10))
dput output :
structure(list(lexptot = c(8.28377505197124, 9.1595012302023,
8.14707583238833, 9.86330744180814, 8.21391453619232, 8.92372556833205,
7.77219149815994, 8.58202430280175, 8.34096828565733, 10.1133857229336
), year = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L), dfmfdyr = c(0,
1, 0, 1, 0, 1, 0, 1, 0, 1), dfmfd98 = c(1, 1, 1, 1, 1, 1, 1,
1, 1, 1), nh = c(11054L, 11054L, 11061L, 11061L, 11081L, 11081L,
11101L, 11101L, 12021L, 12021L)), .Names = c("lexptot", "year",
"dfmfdyr", "dfmfd98", "nh"), vars = list(nh), drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7, 8:9), group_sizes = c(2L, 2L, 2L, 2L,
2L), biggest_group_size = 2L, labels = structure(list(nh = c(11054L,
11061L, 11081L, 11101L, 12021L)), class = "data.frame", row.names = c(NA,
-5L), .Names = "nh", vars = list(nh)), row.names = c(NA, 10L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Error :
Error in structure(list(lexptot = c(8.28377505197124, 9.1595012302023, :
object 'nh' not found
Why is this happening right from a dput command?
Edit :
Relevant posts, but suggestions did not work.
Why does this dplyr dput not work?
Edit 2 :
It appears that because my data frame is grouped (class grouped_df), the dput output cannot be evaluated as-is. The solution is to ungroup the data first (data <- ungroup(data)), then rerun dput, and all works.
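The issue was that the data frame was grouped (class grouped_df): its dput() output contains vars = list(nh), an unquoted symbol that R tries to evaluate when the structure() call is run, hence "object 'nh' not found". The solution was to ungroup() the data before calling dput().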
# ungroup() returns a new data frame rather than modifying `data` in place,
# so its result has to be assigned before dput() is called on it.
data <- ungroup(data)
dput(head(data, 10))
New Data.frame :
structure(list(lexptot = c(8.28377505197124, 9.1595012302023,
8.14707583238833, 9.86330744180814, 8.21391453619232, 8.92372556833205,
7.77219149815994, 8.58202430280175, 8.34096828565733, 10.1133857229336
), year = c(0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L), dfmfd98 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1), dfmfd = c(0L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 1L, 1L)), .Names = c("lexptot", "year", "dfmfd98", "dfmfd"
), class = c("tbl_df", "data.frame"), row.names = c(NA, -10L))
Related
Below is the structure of a chunk which includes two elements of list1 and list2.
list1:
# list1: two one-row tibbles, each holding a single numeric value per
# chr22_* column.
list1 <- list(structure(list(chr22_20230714_G_A_b38 = 0.0000953181301665087,
chr22_20230737_G_A_b38 = -0.00124036704551427, chr22_20231229_T_A_b38 = 0.000808061558738542,
chr22_20231474_G_A_b38 = 0.000387528601423933, chr22_20231667_C_G_b38 = -0.000120624028990859), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")), structure(list(
chr22_47157062_G_A_b38 = 0.00000909931572319958, chr22_47157212_G_A_b38 = -0.000124084106569373,
chr22_47157394_C_G_b38 = -0.0000752774417069946, chr22_47157559_G_A_b38 = 0.0000808446315377557,
chr22_47157607_T_C_b38 = 0.000237979025556899), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame")))
list2:
list2 <- list(structure(list(name = c("HG00096", "HG00097", "HG00099",
"HG00100", "HG00101"), ENSG = c("ENSG00000040608", "ENSG00000040608",
"ENSG00000040608", "ENSG00000040608", "ENSG00000040608"), expr = c(-0.5186894,
0.6170779, -0.5786774, 0.07324268, -0.7579184), chr22_20230714_G_A_b38 = c(1L,
1L, 1L, 2L, 1L), chr22_20230737_G_A_b38 = c(0L, 0L, 0L, 0L, 0L
), chr22_20231229_T_A_b38 = c(1L, 0L, 1L, 0L, 1L), chr22_20231474_G_A_b38 = c(0L,
1L, 0L, 0L, 0L), chr22_20231667_C_G_b38 = c(1L, 1L, 1L, 2L, 1L
)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
)), structure(list(name = c("HG00096", "HG00097", "HG00099",
"HG00100", "HG00101"), ENSG = c("ENSG00000054611", "ENSG00000054611",
"ENSG00000054611", "ENSG00000054611", "ENSG00000054611"), expr = c(-0.5555929,
0.1600335, 0.4027508, -0.6028474, 2.271097), chr22_47157062_G_A_b38 = c(0L,
1L, 0L, 0L, 0L), chr22_47157212_G_A_b38 = c(0L, 0L, 1L, 1L, 2L
), chr22_47157394_C_G_b38 = c(0L, 1L, 1L, 1L, 2L), chr22_47157559_G_A_b38 = c(0L,
1L, 0L, 0L, 0L), chr22_47157607_T_C_b38 = c(0L, 1L, 1L, 1L, 2L
)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"
)))
Both lists contain the same number and names of elements, as well as the same number of columns in each corresponding element. Using this assumption, I want to multiply the value of each column in list1 by the corresponding column in list2.
Desired output:
out <- list(structure(list(name = c("HG00096", "HG00097", "HG00099",
"HG00100", "HG00101"), ENSG = c("ENSG00000040608", "ENSG00000040608",
"ENSG00000040608", "ENSG00000040608", "ENSG00000040608"), expr = c(-0.5186894,
0.6170779, -0.5786774, 0.07324268, -0.7579184), chr22_20230714_G_A_b38 = c(0.0000953,
0.0000953, 0.0000953, 0.0001906, 0.0000953), chr22_20230737_G_A_b38 = c(0,
0, 0, 0, 0), chr22_20231229_T_A_b38 = c(0.000808, 0, 0.000808,
0, 0.000808), chr22_20231474_G_A_b38 = c(0, 0.000388, 0, 0, 0
), chr22_20231667_C_G_b38 = c(-0.000121, -0.000121, -0.000121,
-0.000242, -0.000121)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")), structure(list(name = c("HG00096", "HG00097",
"HG00099", "HG00100", "HG00101"), ENSG = c("ENSG00000054611",
"ENSG00000054611", "ENSG00000054611", "ENSG00000054611", "ENSG00000054611"
), expr = c(-0.5555929, 0.1600335, 0.4027508, -0.6028474, 2.271097
), chr22_47157062_G_A_b38 = c(0, 0.0000091, 0, 0, 0), chr22_47157212_G_A_b38 = c(0,
0, -0.000124, -0.000124, -0.000248), chr22_47157394_C_G_b38 = c(0,
-0.0000753, -0.0000753, -0.0000753, -0.0001506), chr22_47157559_G_A_b38 = c(0,
0.0000808, 0, 0, 0), chr22_47157607_T_C_b38 = c(0, 0.000238,
0.000238, 0.000238, 0.000476)), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame")))
We could use map2 from purrr, or Map in base R:
library(dplyr)
library(tidyr)
library(purrr)  # map2() is exported by purrr, not by dplyr or tidyr

# Walk list2 and list1 in parallel. For each pair, every column of the list2
# element that is named in the (one-row) list1 element is multiplied by the
# matching list1 value; all other columns are left untouched.
outnew <- map2(list2, list1, function(dat2, dat1) {
  dat2 %>%
    mutate(across(all_of(names(dat1)), ~ .x * dat1[[cur_column()]]))
})
I have this data acquired from a JSON routine, which includes 5 different records. Each record has either 41 or 0 sub-elements. In addition, sub-elements 40 and 41 are sub-lists.
I need to convert this into a main dataframe in which each record will show as a row with 39 columns (sub-lists can go away).
Also, records with no data (elements 2,3,4) will still show up as rows with NULL.
a = list(structure(list(ConOrden = 1L, TipoMed = 1L, TipoPrest = 2L,
CausaS1 = 0L, CausaS2 = 0L, CausaS3 = 0L, MedPBSUtilizado = NA,
RznCausaS31 = 0L, DescRzn31 = NA, RznCausaS32 = 0L, DescRzn32 = NA,
CausaS4 = 1L, MedPBSDescartado = NA, RznCausaS41 = 0L, DescRzn41 = NA,
RznCausaS42 = 0L, DescRzn42 = NA, RznCausaS43 = 1L, DescRzn43 = "N.A.",
RznCausaS44 = 0L, DescRzn44 = NA, CausaS5 = 1L, RznCausaS5 = NA,
CausaS6 = NA, DescMedPrinAct = "[APIXABAN] 5mg/1U", CodFF = "COLFF001",
CodVA = "048", JustNoPBS = "Paciente con infeccion por sarscov2",
Dosis = "5", DosisUM = "0168", NoFAdmon = "12", CodFreAdmon = 2L,
IndEsp = 10L, CanTrat = "3", DurTrat = 5L, CantTotalF = "180",
UFCantTotal = "66", IndRec = "Paciente con infeccion por sarscov2",
EstJM = 1L, PrincipiosActivos = list(structure(list(ConOrden = 1L,
CodPriAct = "08626", ConcCant = "5", UMedConc = "0168",
CantCont = "1", UMedCantCont = "0247"), class = "data.frame", row.names = 1L)),
IndicacionesUNIRS = list(list())), class = "data.frame", row.names = 1L),
list(), structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"),
structure(list(), .Names = character(0), row.names = integer(0), class = "data.frame"),
structure(list(ConOrden = 1L, TipoMed = 1L, TipoPrest = 2L,
CausaS1 = 0L, CausaS2 = 0L, CausaS3 = 0L, MedPBSUtilizado = NA,
RznCausaS31 = 0L, DescRzn31 = NA, RznCausaS32 = 0L, DescRzn32 = NA,
CausaS4 = 1L, MedPBSDescartado = "OXICODONA", RznCausaS41 = 1L,
DescRzn41 = "extreñimiento", RznCausaS42 = 0L, DescRzn42 = NA,
RznCausaS43 = 0L, DescRzn43 = NA, RznCausaS44 = 0L, DescRzn44 = NA,
CausaS5 = 1L, RznCausaS5 = NA, CausaS6 = NA, DescMedPrinAct = "[ACETAMINOFEN] ",
CodFF = "COLFF001", CodVA = "048", JustNoPBS = "dolor de dificil modulacion",
Dosis = "325", DosisUM = "0168", NoFAdmon = "8", CodFreAdmon = 2L,
IndEsp = 10L, CanTrat = "60", DurTrat = 3L, CantTotalF = "180",
UFCantTotal = "66", IndRec = "tomar una cada 8 horas ",
EstJM = 1L, PrincipiosActivos = list(structure(list(ConOrden = c(1L,
1L), CodPriAct = c("00626", "50055"), ConcCant = c("325",
"30"), UMedConc = c("0168", "0168"), CantCont = c("1",
"1"), UMedCantCont = c("0247", "0247")), class = "data.frame", row.names = 1:2)),
IndicacionesUNIRS = list(list())), class = "data.frame", row.names = 1L))
For each list element, return the first 39 columns if the number of rows in the data is greater than 0. If the element is empty, return a one-row tibble with a single NA column instead, so that the record still shows up as a row.
library(dplyr)
library(purrr)

# Keep the first 39 columns of every non-empty record; an empty record is
# replaced by a one-row placeholder so that it still yields a row of NAs.
result <- map_df(
  a,
  function(record) {
    if (NROW(record) > 0) {
      select(record, 1:39)
    } else {
      tibble(ConOrden = NA)
    }
  },
  .id = 'id'
)
dim(result)
#[1] 5 40
In the output we have 40 columns because the first column, id, is added to uniquely identify each list element. For this example, ids 2, 3 and 4 will have NA in all other columns.
I have several dataframes that share the same structure but have different column names. I want to merge them all into one dataframe, but if I use bind_rows() it creates new columns for the names that do not match.
I tried smartbind(), union() , union_all() and other libraries, however, none of them is able to simply merge them.
Here goes some sample data:
df1 <- structure(list(Codigo_Cliente = c(292640L, 48296L, 28368L, 27631L,
21715L, 401076L), Segmento = structure(c(3L, 3L, 3L, 3L, 3L,
5L), .Label = c("Clasico", "Emergente", "Mi_Negocio", "Preferencial",
"Prestige"), class = "factor"), Sal_Cons_CA_2018 = c(115966976.4748,
41404074.5338, 21576406.4326, NA, 5217387.0461, NA), Sal_Cons_CA_2019 = c(233057582.7658,
146012775.8314, 121273292.4548, 72383484.8781, 76605696.1462,
64418761.5503), Tipo_Cliente = structure(c(2L, 2L, 2L, 2L, 2L,
1L), .Label = c("Nuevo", "Viejo"), class = "factor"), diferencia_anual = c(117090606.291,
104608701.2976, 99696886.0222, 72383484.8781, 71388309.1001,
64418761.5503), peso_cambio = c(11.7925653553277, 10.5354732191076,
10.040788765049, 7.28996973463426, 7.18974243396645, 6.48781725327502
), cum = c(117090606.291, 221699307.5886, 321396193.6108, 393779678.4889,
465167987.589, 529586749.1393), cum_cambio = c(11.7925653553277,
22.3280385744352, 32.3688273394842, 39.6587970741185, 46.8485395080849,
53.33635676136), ones = c(1, 1, 1, 1, 1, 1), clientes = c(1,
2, 3, 4, 5, 6), porcentaje_acumulado_clientes = c(0.040650406504065,
0.0813008130081301, 0.121951219512195, 0.16260162601626, 0.203252032520325,
0.24390243902439), Tipo_Aportante = c("Viejo Aportante", "Viejo Aportante",
"Viejo Aportante", "Nuevo Aportante", "Viejo Aportante", "Nuevo Aportante"
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), groups = structure(list(Codigo_Cliente = c(21715L, 27631L,
28368L, 48296L, 292640L, 401076L), Segmento = structure(c(3L,
3L, 3L, 3L, 3L, 5L), .Label = c("Clasico", "Emergente", "Mi_Negocio",
"Preferencial", "Prestige"), class = "factor"), .rows = list(
5L, 4L, 3L, 2L, 1L, 6L)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
df2 <- structure(list(Codigo_Cliente = c(29460L, 208833L, 494610L, 292653L,
371679L, 54042L), Segmento = structure(c(3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Clasico", "Emergente", "Mi_Negocio", "Preferencial",
"Prestige"), class = "factor"), Sal_Cons_CC_2018 = c(249412694.49,
226519.47, NA, 232072.25, 893861.14, 2305969.41), Sal_Cons_CC_2019 = c(492333714.52,
217220231.86, 140551673.22, 73744015.83, 57995686.81, 54669407.01
), Tipo_Cliente = structure(c(2L, 2L, 1L, 2L, 2L, 2L), .Label = c("Nuevo",
"Viejo"), class = "factor"), diferencia_anual = c(242921020.03,
216993712.39, 140551673.22, 73511943.58, 57101825.67, 52363437.6
), peso_cambio = c(30.7889911838579, 27.5028381525124, 17.8142024395939,
9.31726115143663, 7.23736301995891, 6.63679667747068), cum = c(242921020.03,
459914732.42, 600466405.64, 673978349.22, 731080174.89, 783443612.49
), cum_cambio = c(30.7889911838579, 58.2918293363703, 76.1060317759641,
85.4232929274008, 92.6606559473597, 99.2974526248303), ones = c(1,
1, 1, 1, 1, 1), clientes = c(1, 2, 3, 4, 5, 6), porcentaje_acumulado_clientes = c(0.0369822485207101,
0.0739644970414201, 0.11094674556213, 0.14792899408284, 0.18491124260355,
0.22189349112426), Tipo_Aportante = c("Viejo Aportante", "Viejo Aportante",
"Nuevo Aportante", "Viejo Aportante", "Viejo Aportante", "Viejo Aportante"
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L), groups = structure(list(Codigo_Cliente = c(29460L, 54042L,
208833L, 292653L, 371679L, 494610L), Segmento = structure(c(3L,
3L, 3L, 3L, 3L, 3L), .Label = c("Clasico", "Emergente", "Mi_Negocio",
"Preferencial", "Prestige"), class = "factor"), .rows = list(
1L, 6L, 2L, 4L, 5L, 3L)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE))
You can use the data.table package, which has the rbindlist function:
# The data frames share a structure but not column names, so the rows must be
# stacked by position: use.names = FALSE ignores the names and keeps those of
# the first data frame. (use.names = TRUE would try to match columns by name.)
df <- data.table::rbindlist(list(df1, df2), use.names = FALSE)
I have a dataset called dietox which has missing values (NA) for the Feed variable. I need to use conditional selection to create a subset of the data for which the rows with missing values are deleted.
The code I tried was:
dietox[!is.NA[dietox$Feed, ]
... but am not sure if that is right to create a subset.
dput(head(dietox))
# Six-row sample of dietox; Feed is NA in the first row.
dietox <- structure(list(Weight = c(26.5, 27.59999, 36.5, 40.29999, 49.09998,
55.39999), Feed = c(NA, 5.200005, 17.6, 28.5, 45.200001, 56.900002 ),
Time = 1:6, Pig = c(4601L, 4601L, 4601L, 4601L, 4601L, 4601L ),
Evit = c(1L, 1L, 1L, 1L, 1L, 1L), Cu = c(1L, 1L, 1L, 1L, 1L, 1L),
Litter = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("Weight", "Feed", "Time", "Pig", "Evit", "Cu", "Litter"),
row.names = c(NA, 6L), class = "data.frame")
You have the right idea, but is.na is a function (all lower case), so it needs to be called with parentheses rather than square brackets.
# Keep only the rows whose Feed value is not NA; all columns are retained.
dietox[!is.na(dietox$Feed), ]
I'm trying to create a cumulative graph as shown here, with another caveat. The steps should be based on 2-minute time intervals, whereby an interval may have multiple entries or even none.
I used rowSums to create the column for the value to be used in cumsum,
e.g.,
# For each label, count per row how many of the columns after the first one
# equal that label, and store the count in a column named after the label.
for (vb_label in c("intraverbal", "tact", "mand", "echoic")) {
  df_so[[vb_label]] <- rowSums(df_so[-1] == vb_label)
}
The graph worked out well enough using plot:
# Stair-step plot (type "s") of the running total of the intraverbal counts,
# taken in row order.
plot(cumsum(df_so$intraverbal), type="s")
However, there are a couple ways it falls short. Ideally, the data would be tallied and labeled according to the "time bin". At the very least, the time bins should be on the x-label, but the increments aren't continuous. Hypothetically, I should be using dplyr or lapply to melt and combine them - but I'm not sure how. Perhaps, something as described here.
It would be nice to accomplish this with ggplot, so that the varying cumsums can be on the same graph, e.g., like here, or perhaps with stat_bin as here.
Here's a small working sample of the data:
df_so <- structure(list(time.bin = structure(c(1L, 1L, 1L, 1L, 1L, 1L,1L, 124L, 124L, 124L), .Label = c("0:00:00", "0:02:00", "0:04:00","0:06:00", "0:08:00", "0:10:00", "0:12:00", "0:14:00", "0:16:00","0:18:00",
"0:20:00", "0:22:00", "0:24:00", "0:26:00", "0:28:00","0:30:00", "0:32:00", "0:34:00", "0:36:00", "0:38:00", "0:40:00","0:42:00", "0:44:00", "0:46:00", "0:48:00", "0:50:00", "0:52:00","0:54:00", "0:56:00", "0:58:00",
"1:00:00", "1:02:00", "1:04:00","1:06:00", "1:08:00", "1:10:00", "1:12:00", "1:14:00", "1:16:00","1:18:00", "1:20:00", "1:22:00", "1:24:00", "1:26:00", "1:28:00","1:30:00", "1:32:00", "1:34:00", "1:36:00", "1:38:00",
"1:40:00","1:42:00", "1:44:00", "1:46:00", "1:48:00", "1:50:00", "1:52:00","1:54:00", "1:56:00", "1:58:00", "2:00:00", "2:02:00", "2:04:00","2:06:00", "2:08:00", "2:10:00", "2:12:00", "2:14:00", "2:16:00","2:18:00",
"2:20:00", "2:22:00", "2:24:00", "2:26:00", "2:28:00","2:30:00", "2:32:00", "2:34:00", "2:36:00", "2:38:00", "2:40:00","2:42:00", "2:44:00", "2:46:00", "2:48:00", "2:50:00", "2:52:00","2:54:00", "2:56:00", "2:58:00",
"3:00:00", "3:02:00", "3:04:00","3:06:00", "3:08:00", "3:10:00", "3:12:00", "3:14:00", "3:16:00","3:18:00", "3:20:00", "3:22:00", "3:24:00", "3:26:00", "3:28:00","3:30:00", "3:32:00", "3:34:00", "3:36:00", "3:38:00", "3:40:00","3:42:00", "3:44:00", "3:48:00", "3:50:00", "3:52:00", "3:54:00","3:56:00", "3:58:00", "4:00:00", "4:02:00", "4:04:00", "4:06:00","4:08:00"), class = "factor"),
Primary.VB = structure(c(1L,3L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L), .Label = c("", "echoic","intraverbal", "mand", "tact"), class = "factor"),
Secondary.VB = structure(c(1L,1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "echoic","intraverbal", "mand", "tact"), class = "factor"),
Tertiary.VB = structure(c(1L,1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "intraverbal","mand", "tact"), class = "factor"), intraverbal = c(0, 1, 0,1, 0, 1, 0, 0, 0, 0),
tact = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0),mand = c(0, 0, 0, 1, 0, 0, 0, 0, 0, 0),
echoic = c(0, 0,0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("time.bin", "Primary.VB","Secondary.VB","Tertiary.VB","intraverbal",
"tact", "mand", "echoic"), row.names = c(1L, 2L,3L, 4L, 5L, 6L, 7L, 1648L, 1649L, 1650L), class = "data.frame")
Not an answer, just an extended comment that I'll delete. If we ignore for a second that the x axis represents the factor level numbers... does it look alright?
# Cumulative count of "intraverbal" across time bins, drawn as a step plot.
df_so %>%
  as_tibble() %>%  # tbl_df() is deprecated; as_tibble() is its replacement
  group_by(time.bin) %>%
  # The last value of a within-bin cumsum is simply the bin total.
  summarise(last = sum(intraverbal)) %>%
  mutate(
    tCsum = cumsum(last),
    # NOTE: this yields the factor level codes, not elapsed time.
    time.bin = as.numeric(time.bin)
  ) %>%
  ggplot(aes(time.bin, tCsum)) +
  geom_step()