Related
I have two datasets:
one is actual count and other one is predicted counts. I want to do a pearson correlation between them.
My actual count data look like this:
My predicted counts data look like this:
I want to do pearson correlation for these two datasets for each geneID.
I have written this code:
install.packages("Rcpp")
library(Rcpp)
library("reshape2")
library("ggplot2")
# import in the actual expression values and the gene predicted values
act_cts <- read.delim("GVDS_normalized_counts_2021v1.txt", header = TRUE, sep="\t")
## fix the column names
colnames(act_cts)[1]<-"gene"
colnames(act_cts)<- substr(colnames(act_cts), 1, 7)
pred_cts<-read.delim("GVDS_PrediXcan_Test_2021v1.txt", header=TRUE, sep="\t")
colnames(pred_cts)<-substr(colnames(pred_cts), 1, 15)
## melt the predict counts, so the columns change to row entries FID, IID, gene
melt_pred_cts<-melt(pred_cts, id.vars=c("FID","IID"), variable.name="gene", value.name = "gene_exp")
## melts the actual counts, so it can be easily joined to the final prediction
melt_act_cts<-melt(act_cts, id.vars="gene", variable.name="IID", value.name = "act_gene_exp")
final_cts<-merge(melt_pred_cts,melt_act_cts)
## this takes a minute/ several minutes to run because it is joining on both gene and IID
# runs the Pearson correlation for each gene
all_genes<-unique(final_cts$gene)
pear_cor_all_df<- data.frame(gene=character(), pear_coeff=double())
## runs the correlation
for(g in all_genes)
{
wrk_cts_all<-final_cts[which(final_cts$gene==g),]
# temp working df for each gene
pear_coef_all<-cor(wrk_cts_all$gene_exp, wrk_cts_all$act_gene_exp, method="pearson")
# runs the correlation for each gene between gene_exp and act_gene_exp
new_row_all<-c(g, pear_coef_all)
pear_cor_all_df<-rbind(pear_cor_all_df, new_row_all)
#saves this to the df
}
But its not giving me the correct results.
This is data for act_count:
dput(act_counts[1:10, 1:10])
structure(list(gene = c("ENSG00000152931.6", "ENSG00000183696.9",
"ENSG00000139269.2", "ENSG00000169129.8", "ENSG00000134602.11",
"ENSG00000136237.12", "ENSG00000259425.1", "ENSG00000242284.2",
"ENSG00000235027.1", "ENSG00000228169.3"), Gene_Sy = c("ENSG00000152931.6",
"ENSG00000183696.9", "ENSG00000139269.2", "ENSG00000169129.8",
"ENSG00000134602.11", "ENSG00000136237.12", "ENSG00000259425.1",
"ENSG00000242284.2", "ENSG00000235027.1", "ENSG00000228169.3"
), Chr = c("5", "7", "12", "10", "X", "7", "15", "X", "11", "10"
), Coord = c(59783540, 48128225, 57846106, 116164515, 131157293,
22396763, 23096869, 134953994, 1781578, 116450393), HG00096 = c(0.101857770468582,
8.1838049456063, 1.19991028786682, 0.831939826228749, 27.6464223725999,
3.78850273139249, 0.0540590649819536, 0.351716382898523, 0.200791414339667,
96.1821778045089), HG00097 = c(0.0781095249582053, 5.68691050653862,
1.57357169691446, 0.0697777450667378, 24.3955715036476, 2.05096276937706,
0.112185357489692, 0.444540251941709, 0.190137938062251, 101.17926156721
), HG00099 = c(0.0489806714207954, 2.43465332606958, 0.521615781673147,
0.93108575037257, 16.4453735152148, 4.00031300285966, 0.00359181983091798,
0.227707651999832, 0.0929246302159905, 58.7830634918037), HG00100 = c(0.118597118618172,
3.83089421985197, 1.44722544015787, 0.620940765480242, 24.8066495438254,
3.27161920134705, 0.00049968321150251, 0.714112406249513, 0.108789749488722,
105.483527339859), HG00101 = c(0.00403496367614745, 6.61228835251498,
3.56579072437701, 1.66066836204679, 25.1133488775017, 1.79821591847768,
0.0293976115522442, 0.450911709524112, 0.23244822901371, 105.818192023699
), HG00102 = c(0.0109253485646219, 4.70964559086586, 1.98268073472144,
0.570481056180073, 19.2339882617972, 1.51668840574531, 0.0312661751488703,
0.491437808951175, 0.250905117203001, 136.140843495464)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
This is prd_counts:
dput(prd_counts[1:10, 1:10])
structure(list(FID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"
), IID = c("HG00096", "HG00097", "HG00099", "HG00100", "HG00101",
"HG00102", "HG00103", "HG00105", "HG00106", "HG00107"), ENSG00000182902.8 = c(0.0223611610092831,
0.0385031316687293, -0.0682504384265577, 0.00018098416274239,
-0.045492721345375, -0.10473163051734, -0.0215970711860838, 0.060455638944161,
-0.00889260689717109, -0.102096211855105), ENSG00000183307.3 = c(0.129041336028238,
-0.13226906002202, 0.005409246530295, -0.0539556427088601, -0.00699884042001628,
-0.204743560777908, -0.0534359750800079, -0.235648260835705,
-0.10230402771496, -0.0914043464852205), ENSG00000237438.1 = c(-0.758838434524167,
-0.579236418964912, -0.695762357174973, -0.368416879945024, -0.339555280234214,
-0.809438763600528, -0.359798980325098, -0.417769387016999, -0.724636782037491,
-0.309671271758401), ENSG00000243156.2 = c(-0.58456094489168,
0.105851861253113, -0.275061563982305, -0.0406543077034047, -0.522672785138957,
-0.126100301787985, -0.288382571274346, -0.354309857822533, -0.314842662063296,
-0.141401921597711), ENSG00000099968.13 = c(0.135357355615122,
0.157616292043257, 0.180059097593111, 0.250009792099489, 0.170653230854707,
0.316157576642492, 0.314671674077333, 0.224102148083679, 0.232969333848649,
0.14963210689311), ENSG00000069998.8 = c(-0.0346986034383362,
-0.0173493017191681, 0, -0.0173493017191681, -0.645266014640116,
-0.0346986034383362, -0.0173493017191681, -0.0173493017191681,
-0.0346986034383362, 0), ENSG00000184979.8 = c(-0.160573318589815,
0.54683218159596, 0.3503062647549, 0.653899917577768, 0.321280544783323,
0.653727041876318, 0.822864620159811, 1.03780221621802, -0.195295753744408,
-0.228590172992798), ENSG00000070413.12 = c(0.775225873145799,
0.602092262450708, 1.0198591935485, 0.65587457098494, 0.306445027670957,
0.581202299884586, 0.836112660742631, 0.559373823767867, 0.46977171007116,
0.84426113999649)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The provided test samples will not work because there are no genes in common between act_counts and prd_counts. I took the liberty of fixing that by reassigning column names:
library(dplyr)
library(tidyr)
## the line below fixes the problem with test samples
colnames(prd_counts)[3:10] <- act_counts$gene[1:8]
acts <- pivot_longer(act_counts,
cols = starts_with("HG"),
names_to = "FID",
values_to = "Actual")
prds <- pivot_longer(prd_counts,
cols = starts_with("ENSG"),
names_to = "gene",
values_to = "Predicted")
inner_join(acts, prds,
by = c("gene", "FID")) |>
select(gene, FID, Actual, Predicted) |>
group_by(gene) |>
summarize(rho = cor(Actual, Predicted))
##> # A tibble: 8 × 2
##> gene rho
##> <chr> <dbl>
##> 1 ENSG00000134602.11 -0.445
##> 2 ENSG00000136237.12 0.446
##> 3 ENSG00000139269.2 0.543
##> 4 ENSG00000152931.6 0.770
##> 5 ENSG00000169129.8 -0.802
##> 6 ENSG00000183696.9 0.405
##> 7 ENSG00000242284.2 -0.503
##> 8 ENSG00000259425.1 -0.110
I am trying to create a map where I show the amount and category of Exports in every European country, using a scatterpie plot. This is the data I am trying to represent:
Country A B C D E F G Total
1 FR 48208727011 129696846358 34574295963 99154544367 87056475894 104059261659 391086898 50.3141238
2 BE 30008344795 130642251666 27315419464 48966420544 51351672841 57686707705 875915760 34.6846733
3 NL 53815652300 126965690773 52604259051 164935573324 43089183110 79607329056 516212340 52.1533900
4 DE 79643366705 285793555191 66579801287 230961697801 160598853461 167790359814 13590821673 100.4958456
5 IT 35306881277 124880125091 31042897909 65051137874 44481779280 65707113992 307508636 36.6777444
6 UK 4190569134 14226329164 4343541388 8299777138 7863823675 8191378024 177728913 4.7293147
7 IE 8049979989 25547263228 3324685081 15609577840 18293778082 13299495081 284077060 8.4408856
8 DK 10844794488 22366273732 3669934507 20904821209 8871184551 17364886109 1104100358 8.5125995
9 EL 5548998459 14199041489 9684405892 6969942717 2877634605 8740624663 9513713 4.8030162
10 PT 9302893141 19921174761 5742487970 12183620710 9794141959 10889202370 59025653 6.7892547
11 ES 29087706350 79136960848 26777114009 45807156391 43316950993 54577475375 225619825 27.8928984
12 LU 2103037221 5485541709 1274451840 3165573258 3448812873 2685200517 23828895 1.8186446
13 SE 14297019504 32367817406 10023929115 31082425639 18504243058 21520786963 251825497 12.8048047
14 FI 4368941438 17924135085 6424290821 13268574752 7679357024 7759601514 87932852 5.7512833
15 AT 11108739001 47969735941 8282060600 36180768764 20761302493 26060191499 319396555 15.0682195
16 MT 529547453 748570490 789405002 772157398 939286493 808546088 1179489 0.4588692
17 EE 1387220092 4797469841 1253135597 3127037067 1483571375 2251847940 315884341 1.4616166
18 LV 2714038229 4237027490 958962478 3158721396 1479290893 2931423023 89667330 1.5569131
19 LT 3408636288 8761053696 3263941940 5534705815 2630113004 4477086678 348351748 2.8423889
20 PL 17264039729 70678231411 11723435712 53284056901 28214023352 41438947683 319384835 22.2922120
21 CZ 7664643659 38573705210 5359209173 54059163460 20745595183 22423687496 216009863 14.9042014
22 SK 4193310193 17229538594 3771900263 19251595573 18415022178 10092362707 163300267 7.3117030
23 HU 5067726212 26282833327 5807291521 31406620462 16576651093 12918544146 456905984 9.8516573
24 RO 7210065674 24768518425 3986448288 20279628790 10274528929 13490373296 213856837 8.0223420
25 BG 3364866564 11098005470 2490021719 5767532283 2282959524 4540599434 289425842 2.9833411
26 SI 2226481542 11769625979 2186097710 5986840366 6169533307 8453642146 32927930 3.6825149
27 HR 2664219116 7204053277 2281750708 4155735739 2094082503 4970586651 14826478 2.3385254
28 CY 847756088 1467939342 983937418 824244195 1900124484 1375465594 47109886 0.7446577
Using the following code:
library(giscoR)
borders <- gisco_get_countries(
epsg = "3035",
year = "2020",
resolution = "3",
country = idf$Country
)
merged <- merge(borders,
idf,
by.x = "CNTR_ID",
by.y = "Country",
all.x = TRUE
)
library(tidyverse)
symbol_pos <- st_centroid(merged, of_largest_polygon = TRUE)
separate_coords = symbol_pos %>% mutate(lat = unlist(map(symbol_pos$geometry, 1)), long = unlist(map(symbol_pos$geometry, 2)))
sympos = data.frame(Country = separate_coords$CNTR_ID, lat = separate_coords$lat, long = separate_coords$long)
merged <- merge(merged,
sympos,
by.x = "CNTR_ID",
by.y = "Country",
all.x = TRUE
)
ggplot() +
geom_sf(data = merged, size = 0.1) +
geom_scatterpie(data = merged, aes(x = long, y = lat, r = Total), cols = LETTERS[1:7])+
coord_sf(xlim = c(2377294, 6500000), ylim = c(1413597, 5228510))
And it gives me this error:
Error in rowSums(data[, cols]) : 'x' must be numeric
I am trying to create a map similar to this one:
And I would be grateful if someone can provide some hint as to how to fix the error. Thanks.
Edit: below is the dput(idf) output:
structure(list(Country = c("FR", "BE", "NL", "DE", "IT", "UK",
"IE", "DK", "EL", "PT", "ES", "LU", "SE", "FI", "AT", "MT", "EE",
"LV", "LT", "PL", "CZ", "SK", "HU", "RO", "BG", "SI", "HR", "CY"
), A = c(48208727011, 30008344795, 53815652300, 79643366705,
35306881277, 4190569134, 8049979989, 10844794488, 5548998459,
9302893141, 29087706350, 2103037221, 14297019504, 4368941438,
11108739001, 529547453, 1387220092, 2714038229, 3408636288,
17264039729,
7664643659, 4193310193, 5067726212, 7210065674, 3364866564,
2226481542,
2664219116, 847756088), B = c(129696846358, 130642251666,
126965690773,
285793555191, 124880125091, 14226329164, 25547263228,
22366273732,
14199041489, 19921174761, 79136960848, 5485541709, 32367817406,
17924135085, 47969735941, 748570490, 4797469841, 4237027490,
8761053696, 70678231411, 38573705210, 17229538594, 26282833327,
24768518425, 11098005470, 11769625979, 7204053277, 1467939342
), C = c(34574295963, 27315419464, 52604259051, 66579801287,
31042897909, 4343541388, 3324685081, 3669934507, 9684405892,
5742487970, 26777114009, 1274451840, 10023929115, 6424290821,
8282060600, 789405002, 1253135597, 958962478, 3263941940,
11723435712,
5359209173, 3771900263, 5807291521, 3986448288, 2490021719,
2186097710,
2281750708, 983937418), D = c(99154544367, 48966420544,
164935573324,
230961697801, 65051137874, 8299777138, 15609577840, 20904821209,
6969942717, 12183620710, 45807156391, 3165573258, 31082425639,
13268574752, 36180768764, 772157398, 3127037067, 3158721396,
5534705815, 53284056901, 54059163460, 19251595573, 31406620462,
20279628790, 5767532283, 5986840366, 4155735739, 824244195),
E = c(87056475894, 51351672841, 43089183110, 160598853461,
44481779280, 7863823675, 18293778082, 8871184551, 2877634605,
9794141959, 43316950993, 3448812873, 18504243058, 7679357024,
20761302493, 939286493, 1483571375, 1479290893, 2630113004,
28214023352, 20745595183, 18415022178, 16576651093, 10274528929,
2282959524, 6169533307, 2094082503, 1900124484), F =
c(104059261659,
57686707705, 79607329056, 167790359814, 65707113992, 8191378024,
13299495081, 17364886109, 8740624663, 10889202370, 54577475375,
2685200517, 21520786963, 7759601514, 26060191499, 808546088,
2251847940, 2931423023, 4477086678, 41438947683, 22423687496,
10092362707, 12918544146, 13490373296, 4540599434, 8453642146,
4970586651, 1375465594), G = c(391086898, 875915760, 516212340,
13590821673, 307508636, 177728913, 284077060, 1104100358,
9513713, 59025653, 225619825, 23828895, 251825497, 87932852,
319396555, 1179489, 315884341, 89667330, 348351748, 319384835,
216009863, 163300267, 456905984, 213856837, 289425842, 32927930,
14826478, 47109886), Total = c(50.314123815, 34.6846732775,
52.1533899954, 100.4958455932, 36.6777444059, 4.7293147436,
8.4408856361, 8.5125994954, 4.8030161538, 6.7892546564,
27.8928983791,
1.8186446313, 12.8048047182, 5.7512833486, 15.0682194853,
0.4588692413, 1.4616166253, 1.5569130839, 2.8423889169,
22.2922119623,
14.9042014044, 7.3117029775, 9.8516572745, 8.0223420239,
2.9833410836, 3.682514898, 2.3385254472, 0.7446577007)),
row.names = c(NA,
-28L), class = "data.frame")
Please find below one possible solution to your request. The main problem was that geom_scatterpie() expects a dataframe and not an sf object. So you need to use as.data.frame() inside geom_scatterpie(). I also took the opportunity to simplify your code a bit.
Reprex
Code
library(giscoR)
library(sf)
library(dplyr)
library(ggplot2)
library(scatterpie)
borders <- gisco_get_countries(
epsg = "3035",
year = "2020",
resolution = "3",
country = idf$Country
)
merged <- merge(borders,
idf,
by.x = "CNTR_ID",
by.y = "Country",
all.x = TRUE
)
symbol_pos <- st_centroid(merged, of_largest_polygon = TRUE)
sympos <- symbol_pos %>%
st_drop_geometry() %>%
as.data.frame() %>%
cbind(., symbol_pos %>% st_coordinates()) %>%
select(CNTR_ID, X, Y) %>%
rename(Country = CNTR_ID, long = X, lat = Y)
merged <- merge(merged,
sympos,
by.x = "CNTR_ID",
by.y = "Country",
all.x = TRUE
)
Visualization
ggplot() +
geom_sf(data = merged, size = 0.1) +
geom_scatterpie(data = as.data.frame(merged), aes(x = long, y = lat, r = Total*2200), cols = LETTERS[1:7]) +
coord_sf(xlim = c(2377294, 6500000), ylim = c(1413597, 5228510))
Created on 2022-01-23 by the reprex package (v2.0.1)
I am trying to reorder the following graph based on the rank of the lowest confidence interval (conf.low). This means that Austria (AU) should be the first country, Bulgaria (BG) the second and Belgium (BE) the third. I know there is a way to do it manually by choosing the order of the country variable but i prefer to find a way to do it automatically since i have 30 countries. Could someone help?
Here is the data and the code:
df= structure(list(cntry = structure(1:3, .Label = c("AU", "BE",
"BG"), class = "factor"), estimate = c(0.0053, 0.01740,
0.0036), conf.low = c(-0.0257, 0.0005,
-0.0006), conf.high = c(0.0365, 0.0343,
0.0079)), row.names = c(NA, -3L), class = "data.frame")
df %>%
arrange(estimate) %>%
mutate(label = replace(round(estimate, 3),cntry==1, '')) %>%
ggplot(aes(estimate, cntry,label=label)) +
geom_point()+
geom_text(vjust= -1) +
geom_linerange(mapping=aes(xmin=conf.low , xmax=conf.high, y=cntry)) +
geom_point(mapping=aes(x=estimate, y=cntry))
Using forcats::fct_reorder() you could do this:
library(dplyr)
library(ggplot2)
library(forcats)
df %>%
arrange(estimate) %>%
mutate(label = replace(round(estimate, 3), cntry==1, '')) %>%
ggplot(aes(estimate, fct_reorder(cntry, conf.low, .desc = TRUE),label=label)) +
geom_point()+
geom_text(vjust= -1) +
geom_linerange(mapping=aes(xmin=conf.low , xmax=conf.high, y=cntry)) +
geom_point(mapping=aes(x=estimate, y=cntry))+
ylab("Country")
Created on 2021-04-22 by the reprex package (v2.0.0)
data
df= structure(list(cntry = structure(1:3, .Label = c("AU", "BE",
"BG"), class = "factor"), estimate = c(0.0053, 0.01740,
0.0036), conf.low = c(-0.0257, 0.0005,
-0.0006), conf.high = c(0.0365, 0.0343,
0.0079)), row.names = c(NA, -3L), class = "data.frame")
I am trying to make a shiny app where you can select different miRNA in my input then plot the survival curve using ggsurvplot. There is something wrong with the functions within fitSurv, but I am not sure where I am doing it wrong.
library(dplyr)
require(survminer)
library(tidyverse)
require(reshape2)
library(shiny)
library(tidyr)
require(survival)
example data:
df.miRNA.cpm <- structure(list(`86` = c(5.57979757386892, 17.0240095264258, 4.28380151026145,
13.0457611762755, 12.5531123449841), `175` = c(5.21619202802748,
15.2849097474841, 2.46719979911461, 10.879496005461, 9.66416497290915
), `217` = c(5.42796072966512, 17.1413407297933, 5.15230233060323,
12.2646127361351, 12.1031024927547), `394` = c(-1.1390337316217,
15.1021660424984, 4.63168157763046, 11.1299079134792, 9.55572588729967
), `444` = c(5.06134249676025, 14.5442494311861, -0.399445049232868,
7.45775961504073, 9.92629675808998)), row.names = c("hsa_let_7a_3p",
"hsa_let_7a_5p", "hsa_let_7b_3p", "hsa_let_7b_5p", "hsa_let_7c_5p"
), class = "data.frame")
df.miRNA.cpm$miRNA <- rownames(df.miRNA.cpm)
ss.survival.shiny.miRNA.miRNA <- structure(list(ID = c("86", "175", "217", "394", "444"), TimeDiff = c(71.0416666666667,
601.958333333333, 1130, 1393, 117.041666666667), Status = c(1L,
1L, 0L, 0L, 1L)), row.names = c(NA, 5L), class = "data.frame")
Joint the two example data frames:
data_prep.miRNA <- df.miRNA.cpm %>%
tidyr::pivot_longer(-miRNA, names_to = "ID") %>%
left_join(ss.survival.shiny.miRNA.miRNA)
Example of the joined data:
> data_prep.miRNA
# A tibble: 153,033 x 5
miRNA ID value TimeDiff Status
<chr> <chr> <dbl> <dbl> <int>
1 hsa_let_7a_3p 86 5.58 71.0 1
2 hsa_let_7a_3p 175 5.22 602. 1
3 hsa_let_7a_3p 217 5.43 1130 0
4 hsa_let_7a_3p 394 -1.14 1393 0
5 hsa_let_7a_3p 444 5.06 117. 1
6 hsa_let_7a_3p 618 4.37 1508 0
7 hsa_let_7a_3p 640 2.46 1409 0
8 hsa_let_7a_3p 829 0.435 919. 0
9 hsa_let_7a_3p 851 -1.36 976. 0
10 hsa_let_7a_3p 998 3.87 1196. 0
# … with 153,023 more rows
For a selected MicroRNA this works:
fitSurv <- survfit(Surv(data$TimeDiff, data$Status) ~ paste(cut(value , quantile(value , probs = c(0, 0.8)), include.lowest=T)), data = data_prep.miRNA[grep("hsa_let_7a_3p",data_prep.miRNA$miRNA),])
Shiny:
ui.miRNA <- fluidPage(
selectInput("MicroRNA", "miRNA", choices = unique(data_prep.miRNA$miRNA)),
plotOutput("myplot"))
server <- function(input, output, session) {
data_selected <- reactive({
filter(data_prep.miRNA, miRNA %in% input$MicroRNA)
})
output$myplot <- renderPlot({
fitSurv <- survfit(Surv("TimeDiff", "Status") ~ paste(cut("value" , quantile("value" , probs = c(0, 0.8)), include.lowest=T)), data = data_selected)
ggsurvplot(fitSurv ,title="", xlab="Time (Yrs)", ylab="Survival prbability",
font.main = 8,
font.x = 8,
font.y = 8,
font.tickslab = 8,
font.legend=8,
pval.size = 3,
pval.coord = c(1000,1),
size=0.4,
legend = "right",
censor.size=2,
break.time.by = 365,
pval =T,#"p=0.003",#"p=0.41",
#xscale=365,
#palette = c("#E7B800", "#2E9FDF"),
#ggtheme = theme_bw(),
risk.table = F,
xscale=365.25,
xlim=c(0,7*365))
})
}
shinyApp(ui.miRNA, server)
There are several mistakes in this statement:
fitSurv <-
survfit(Surv("TimeDiff", "Status") ~ paste(cut("value", quantile("value", probs = c(0, 0.8)), include.lowest=T)),
data = data_selected)
First, data_selected is a reactive conductor, not a dataframe. If you want the dataframe returned by this reactive conductor, you have to use parentheses: data_selected().
Next, you must not quote the variables: TimeDiff and not "TimeDiff", etc.
The paste command is useless.
Your cut produces only one category and the NA category. To get two intervals as categories, use probs = c(0, 0.8, 1) in quantile.
Finally it is not a good idea to use T for TRUE, because T can be set to any R object, while TRUE is a reserved work.
To conclude, here is the corrected code:
fitSurv <-
survfit(Surv(TimeDiff, Status) ~ cut(value, quantile(value, probs = c(0, 0.8, 1)), include.lowest=TRUE),
data = data_selected())
I'm trying to create a long tibble dataframe with a date sequence. Now I tried to use this example. The example works but not when I try to implement to my own data. It gives an error message: Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument. At can't figure out why the code on my tibble throws an error... All help much appreciated.
This example works:
library(tidyverse)
example <- structure(list(idnum = c(17L, 17L, 17L), start = structure(c(8401,
8401, 8401), class = "Date"), end = structure(c(8765, 8765, 8765
), class = "Date")), class = "data.frame", .Names = c("idnum",
"start", "end"), row.names = c(NA, -3L))
example %>%
as.tibble() %>%
nest(start, end) %>% view
mutate(data = map(data, ~seq(unique(.x$start), unique(.x$end), 1))) %>%
unnest(data)
That's kind of what I'm looking for.
The code on my data gives an error message.
df <- structure(list(nieuw = c("Nieuw", "Nieuw", "Nieuw"), jaar = c(NA,
2013, 2014), aow_jaar = c("65", "65", "65"), aow_maanden = c(NA,
"1", "2"), vanaf = structure(c(-8036, -8036, -7701), class = "Date"),
tot_en_met = structure(c(-8037, -7702, -7367), class = "Date")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -3L))
df %>%
nest(vanaf, tot_en_met) %>%
mutate(data = map(data, ~seq(unique(.x$vanaf), unique(.x$tot_en_met), 1))) %>%
unnest(data)
Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument
The error message say it has to do with the by = argument but I can't understand why it's not working...
Here, the issue is that one of the rows (1st row) end date is lower than the start date. An option is to check the min/max and then do seq
library(dplyr)
library(purrr)
df %>%
mutate(out = map2(vanaf, tot_en_met,
~ seq(min(.x, .y), max(.x, .y), by = 1))) # %>%
# unnest # if needed
# A tibble: 3 x 7
# nieuw jaar aow_jaar aow_maanden vanaf tot_en_met out
# <chr> <dbl> <chr> <chr> <date> <date> <list>
#1 Nieuw NA 65 <NA> 1948-01-01 1947-12-31 <date [2]>
#2 Nieuw 2013 65 1 1948-01-01 1948-11-30 <date [335]>
#3 Nieuw 2014 65 2 1948-12-01 1949-10-31 <date [335]>
Also, instead of doing min/max in each row, we can do this in a vectorized way with pmin/pmax
df %>%
mutate(out = map2(pmin(vanaf, tot_en_met),
pmax(vanaf, tot_en_met), seq, by = 1))