I have a dataframe of over 200 variables, many of which end with a code for a given species. I want to eliminate any columns that contain one of several codes, contained in a separate vector of character strings. How can I remove these multiple columns matching the multiple codes at the same time? The column names don't match the code values exactly, but contain the codes at the end of the column name. For example:
"rev230" "rev3360" "rev3508"
Manually, I've done this (using the dplyr package):
sub = select(df, -contains("3781"), -contains("3751"), -contains("1408"),
-contains("1409"), -contains("4469"), -contains("1789"), -contains("4559"),
-contains("1453"),-contains("8"), -contains("3508"), -contains("4656"),
-contains("5131"), -contains("9999"))
This gets me what I want (eliminating all columns that contain data on the species matching these codes), but obviously this is very tedious.
I'd like something more like:
sub = select(df, -contains(species$codes))
# I realize this isn't the right syntax
I tried a loop to remove individual columns, using something like this:
foreach(i=1:length(species$codes), .combine=rbind)%do%
select(df, -contains(species$codes[i]))
but that didn't work either. Thanks in advance!
reproducible example:
Species codes (contained in a character vector):
dput(species)
c("3754", "3755", "3758", "3764", "3765", "3771", "3772", "3782",
"3761", "3762", "3763", "3767", "3768", "1790", "1412", "1413",
"1416", "1422", "1423", "1424", "1425", "1426", "1410", "1411",
"1414", "1415", "1420", "3770", "4740", "4470", "4472", "4474",
"4476", "4479", "4480", "1812", "1815", "1799", "4560", "3810",
"1440", "1441", "3302", "3295", "3560", "3360", "1940", "3840",
"570", "1050", "4710", "230")
Here are the first 10 rows of the data, with only columns for species data
dput(logsub)
structure(list(lbs3781 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 708), lbs3764 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3765 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 708), lbs3758 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3755 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3782 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs3751 = c(0, 0, 4, 0, 0, 0, 21, 0, 18, 0), lbs3761 = c(0,
0, 0, 0, 0, 0, 18, 0, 0, 0), lbs3762 = c(0, 0, 4, 0, 0, 0, 3,
0, 0, 0), lbs3763 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3767 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3768 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs3754 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3771 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3772 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1790 = c(0, 0, 0, 0, 0, 0, 0, 0, 18, 0), lbs1409 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 86), lbs1411 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1414 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1415 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 86), lbs4740 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1420 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3770 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1408 = c(2508, 785, 57, 0, 132,
5003, 18, 104, 636, 0), lbs1412 = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0), lbs1413 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1416 = c(2331,
654, 57, 0, 81, 4284, 15, 104, 120, 0), lbs1422 = c(177, 0, 0,
0, 51, 719, 3, 0, 0, 0), lbs1423 = c(0, 131, 0, 0, 0, 0, 0, 0,
502, 0), lbs1424 = c(0, 0, 0, 0, 0, 0, 0, 0, 14, 0), lbs1425 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs1426 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs1410 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs4469 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs4470 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), lbs4472 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
lbs4474 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs4476 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), lbs4479 = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), lbs4480 = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0), lbs1789 = c(0, 0, 0, 863, 0, 0, 0, 0,
0, 98), lbs1812 = c(0, 0, 0, 863, 0, 0, 0, 0, 0, 27), lbs1815 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 71), lbs1799 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), lbs4559 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 12),
lbs4560 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 12), lbs3810 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), lbs1453 = c(0, 0, 5,
0, 0, 0, 21, 0, 15, 235), lbs1440 = c(0, 0, 5, 0, 0, 0, 21,
0, 15, 0), lbs1441 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3560 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3302 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 235), lbs3295 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
lbs0008 = c(0, 97, 99, 0, 0, 0, 0, 0, 0, 0), lbs1940 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3840 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), lbs1050 = c(0, 0, 31, 0, 0, 0, 0, 0, 0, 0),
lbs4710 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs570 = c(0,
97, 68, 0, 0, 0, 0, 0, 0, 0), lbs230 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), lbs3360 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), lbs3508 = c(0,
0, 5043, 0, 0, 0, 0, 0, 0, 0), lbs4656 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), lbs9999 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
rev3781 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 1688.144979), rev3764 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev3765 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 1688.144979), rev3758 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3755 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3782 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev3751 = c(0, 0, 7.063636364,
0, 0, 0, 33.44605263, 0, 32.53608247, 0), rev3761 = c(0,
0, 0, 0, 0, 0, 27.34105263, 0, 0, 0), rev3762 = c(0, 0, 7.063636364,
0, 0, 0, 6.105, 0, 0, 0), rev3763 = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), rev3767 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3768 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev3754 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), rev3771 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3772 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev1790 = c(0, 0, 0, 0, 0, 0,
0, 0, 32.53608247, 0), rev1409 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 260.0068669), rev1411 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), rev1414 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1415 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 260.0068669), rev4740 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev1420 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3770 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1408 = c(6349.327025,
2014.2837, 142.8362084, 0, 339.5618788, 13265.98305, 41.94345809,
235.6862428, 1835.487932, 0), rev1412 = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), rev1413 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
rev1416 = c(5841.249152, 1623.155767, 142.8362084, 0, 194.2835976,
11101.38378, 33.99320809, 235.6862428, 299.2968186, 0), rev1422 = c(508.0778723,
0, 0, 0, 145.2782813, 2164.599274, 7.95025, 0, 0, 0), rev1423 = c(0,
391.1279328, 0, 0, 0, 0, 0, 0, 1494.676782, 0), rev1424 = c(0,
0, 0, 0, 0, 0, 0, 0, 41.51433134, 0), rev1425 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev1426 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev1410 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev4469 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev4470 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), rev4472 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev4474 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), rev4476 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), rev4479 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), rev4480 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1789 = c(0,
0, 0, 963.8520574, 0, 0, 0, 0, 0, 95.34540063), rev1812 = c(0,
0, 0, 963.8520574, 0, 0, 0, 0, 0, 30.02711217), rev1815 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 65.31828847), rev1799 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev4559 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 12.94965112), rev4560 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 12.94965112
), rev3810 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1453 = c(0,
0, 3.505617978, 0, 0, 0, 13.9460241, 0, 10.93726937, 225.778089
), rev1440 = c(0, 0, 3.505617978, 0, 0, 0, 13.9460241, 0,
10.93726937, 0), rev1441 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), rev3560 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3302 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 225.778089), rev3295 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev0008 = c(0, 180.3441341, 169.7750491,
0, 0, 0, 0, 0, 0, 0), rev1940 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3840 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev1050 = c(0,
0, 48.71428571, 0, 0, 0, 0, 0, 0, 0), rev4710 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev570 = c(0, 180.3441341, 121.0607634,
0, 0, 0, 0, 0, 0, 0), rev230 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), rev3360 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), rev3508 = c(0,
0, 2620.957866, 0, 0, 0, 0, 0, 0, 0), rev4656 = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0), rev9999 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0)), .Names = c("lbs3781", "lbs3764", "lbs3765", "lbs3758",
"lbs3755", "lbs3782", "lbs3751", "lbs3761", "lbs3762", "lbs3763",
"lbs3767", "lbs3768", "lbs3754", "lbs3771", "lbs3772", "lbs1790",
"lbs1409", "lbs1411", "lbs1414", "lbs1415", "lbs4740", "lbs1420",
"lbs3770", "lbs1408", "lbs1412", "lbs1413", "lbs1416", "lbs1422",
"lbs1423", "lbs1424", "lbs1425", "lbs1426", "lbs1410", "lbs4469",
"lbs4470", "lbs4472", "lbs4474", "lbs4476", "lbs4479", "lbs4480",
"lbs1789", "lbs1812", "lbs1815", "lbs1799", "lbs4559", "lbs4560",
"lbs3810", "lbs1453", "lbs1440", "lbs1441", "lbs3560", "lbs3302",
"lbs3295", "lbs0008", "lbs1940", "lbs3840", "lbs1050", "lbs4710",
"lbs570", "lbs230", "lbs3360", "lbs3508", "lbs4656", "lbs9999",
"rev3781", "rev3764", "rev3765", "rev3758", "rev3755", "rev3782",
"rev3751", "rev3761", "rev3762", "rev3763", "rev3767", "rev3768",
"rev3754", "rev3771", "rev3772", "rev1790", "rev1409", "rev1411",
"rev1414", "rev1415", "rev4740", "rev1420", "rev3770", "rev1408",
"rev1412", "rev1413", "rev1416", "rev1422", "rev1423", "rev1424",
"rev1425", "rev1426", "rev1410", "rev4469", "rev4470", "rev4472",
"rev4474", "rev4476", "rev4479", "rev4480", "rev1789", "rev1812",
"rev1815", "rev1799", "rev4559", "rev4560", "rev3810", "rev1453",
"rev1440", "rev1441", "rev3560", "rev3302", "rev3295", "rev0008",
"rev1940", "rev3840", "rev1050", "rev4710", "rev570", "rev230",
"rev3360", "rev3508", "rev4656", "rev9999"), row.names = c(34367L,
48646L, 48715L, 48717L, 48722L, 48724L, 48743L, 48744L, 48781L,
48783L), class = "data.frame")
One option is select_if with grepl, as grepl returns a logical vector and you can include multiple patterns you want to match on via |.
A simple case where you want to remove two species would look like:
# Drop every column whose name ends in one of the two codes. The trailing `$`
# anchors the match to the end of the name, so a code that merely appears
# somewhere inside another column name does not remove that column.
select_if(df, !grepl("(3781|3751)$", names(df)))
Aggregate all of your species into the right form for grepl via paste with collapse, and then use this within grepl.
# Build one alternation from all species codes and anchor it to the end of the
# column name. The leading `(^|[^0-9])` requires the code to be the complete
# trailing number, so a short code such as "230" does not also drop a column
# whose name ends in a longer number like "1230".
select_if(
  df,
  !grepl(
    paste0("(^|[^0-9])(", paste(species, collapse = "|"), ")$"),
    names(df)
  )
)
Related
I have fish stomach contents/diet data and I would like to get presence/absence information for each taxa in my df by month. Each observation (row) has information on the taxa absent (== 0) or present (== 1) in each fish's stomach. I have already transformed my original data to presence/absence values, however, I am not sure how to obtain a summary of what taxa was present or absent by month.
structure(list(id = c("607_6", "808_4", "801_3", "807_11", "801_16",
"724_13", "1030_40", "723_78", "701_4", "634_2", "1023_2", "1031_2",
"643_4", "606_3", "723_79", "801_4", "629_4", "642_10", "801_10",
"801_11", "1001_35", "616_4", "701_9", "627_2", "601_5"), Daphnia = c(0,
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), Byths = c(0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1), Chiro.Pupae = c(0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0
), Empty = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Chiro.Larvae = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), Amphipod = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0), Isopod = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Chironomidae = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Hemimysis = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0), Copepoda = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0), Sphaeriidae = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), Chiro.Adult = c(0, 0, 0, 0, 0, 0, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Trichopteran = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1, 0, 0, 0), UID.Fish = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Chydoridae = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), Cyclopoid = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Fish.Eggs = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), EggMass = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Dreissena = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), Goby = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Eurycercidae = c(0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0), Hirudinea = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), totalnumPrey = c(0,
5, 0, 0, 102, 7, 220, 45, 0, 0, 0, 25, 116, 49, 119, 0, 7,
5, 0, 0, 0, 595, 105, 58, 20), MONTH = c(6L, 8L, 8L, 8L,
8L, 7L, 11L, 7L, 7L, 6L, 11L, 11L, 6L, 6L, 7L, 8L, 6L, 6L,
8L, 8L, 11L, 6L, 7L, 6L, 6L), empty = c("Empty", "Not_empty",
"Empty", "Empty", "Not_empty", "Not_empty", "Not_empty",
"Not_empty", "Empty", "Empty", "Empty", "Not_empty", "Not_empty",
"Not_empty", "Not_empty", "Empty", "Not_empty", "Not_empty",
"Empty", "Empty", "Empty", "Not_empty", "Not_empty", "Not_empty",
"Not_empty")), row.names = c(NA, -25L), class = c("data.table",
"data.frame"))
I looked online and various SO posts like this one, but I am not getting exactly what I need.
I would like to end up with something like this (or similar) for each month for all taxa in my df (the values here are made up, might not reflect the real data):
Month
Daphnia
Byths
Chiro.Pupae
Isopod
Goby
11
1
1
0
1
0
My ultimate goal is to make a bar plot in ggplot that looks like this:
Originally, the data was in long format but this results in multiple rows per fish. I changed to wide format to end up with one observation(row) per fish.
How can I achieve this to ultimately plot presence/absence by month? Thank you!
Maybe you want something like this: first convert your selected columns to a longer format. Then, so that absences (zeros) still show up as bars, give them a small negative value. Finally, restrict the y-axis breaks to the binary values 0 and 1. You can use the following code:
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
# Presence/absence of selected taxa by month, one dodged bar per taxon.
# The input has one row per fish, so the long data is first collapsed to a
# single value per month and taxon before plotting.
df %>%
  select(MONTH, Daphnia, Byths, Chiro.Pupae, Isopod, Goby) %>%
  pivot_longer(cols = -c(MONTH), values_transform = as.numeric) %>%
  # A taxon counts as present in a month if at least one fish contained it.
  group_by(MONTH, name) %>%
  summarise(present = max(value), .groups = "drop") %>%
  mutate(
    # A factor with calendar-ordered levels keeps the x-axis in month order
    # instead of sorting the month names alphabetically.
    MONTH = factor(month.name[MONTH], levels = month.name),
    # Absent taxa get a small negative height so that a bar is still drawn.
    bar_height = ifelse(present == 0, -0.1, present)
  ) %>%
  ggplot(aes(x = MONTH, y = bar_height, fill = name)) +
  geom_col(position = "dodge") +
  scale_y_continuous(breaks = c(0, 1)) +
  labs(y = "Absence", x = "Month")
Created on 2022-07-30 by the reprex package (v2.0.1)
I need to reshape (I assume it's some sort of reshape like I would do in Stata) this dataframe so that there is only 1 observation for each id. In addition, I need to preserve all of the other variables. So, one row should have columns for id, each year (containing the value of var1), x, and var2 (var2 is not absolutely necessary). I've tried a lot of different things and I keep getting the same long data for the id variables. I apologize for the long dput(), but there would not be enough information if I only showed the first 6 rows.
structure(list(id= c(1806968L, 1806968L, 1806968L, 1806968L,
1806968L, 1806968L, 1806968L, 1806968L, 1806968L, 1806968L, 1806968L,
1806968L, 1806968L, 1806968L, 1806968L, 2022610L, 2022610L, 2022610L,
2022610L, 2022610L), var1 = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 4877, 5819, 6560, 8262, 0, 0, 0, 0, 0), x = c(25518,
25518, 25518, 25518, 25518, 25518, 25518, 25518, 25518, 25518,
25518, 25518, 25518, 25518, 25518, 34611, 34611, 34611, 34611,
34611), var2 = c(200812L, 200912L, 201012L, 201112L, 201212L,
201312L, 201512L, 201612L, 201712L, 201812L, 201912L, 200612L,
200512L, 200712L, 201412L, 199612L, 199712L, 199812L, 199912L,
200012L), `1987` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0), `1988` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `1989` = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1990` = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1991` = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1992` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1993` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1994` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1995` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1996` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1997` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1998` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1999` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2000` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2001` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2002` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2003` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2004` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2005` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5819, 0, 0, 0, 0, 0, 0, 0),
`2006` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4877, 0, 0, 0,
0, 0, 0, 0, 0), `2007` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 6560, 0, 0, 0, 0, 0, 0), `2008` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2009` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
`2010` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), `2011` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `2012` = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2013` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
`2014` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8262,
0, 0, 0, 0, 0), `2015` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2016` = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `2017` = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
`2018` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0), `2019` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `2020` = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-20L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x55b410de6890>, sorted = c("id",
"var1", "x", "var2"))
I would like it to look something like this, where the value in each year column is the value of var1 that corresponds to that year. Here is a sketch of what I want, without all the years; I still need a column for every year.
id <- c(1806968L, 2022610L)
"1987" <- c(0, 8262)
x <- c(25518, 34611)
data <- data.frame(id, `1987`, x)
It's unclear whether you also want years which are 0 when x is 0. If you do, delete & var1 > 0 below.
library(tidyr)
library(dplyr)
# Stack the year columns (names beginning with "1" or "2") into year/value
# pairs, keep the rows where the stacked value equals a positive var1, and
# drop the helper value column afterwards.
tmp %>%
  pivot_longer(
    cols = starts_with(c("1", "2")),
    names_to = "year"
  ) %>%
  filter(value == var1 & var1 > 0) %>%
  select(!value)
I have plotted a barchart showing mortality rates for municipalities(MUN_RESID) for all months between 08-2005 and 12-2015. Since the data frame is so big, the barchart is not very clear, e.g. one cannot read municipality ids on the x-axis and can't see different coloring for the respective months. I would like to have a barchart more clear, so that one can see id's on the x-axis and the coloring of the bar, if possible.
Thank you very much for your help.
Unfortunately, I am new to R and therefore cannot add a screenshot of the barchart directly, but there is a link provided after the code. Also, maybe some of you have general advice on how to handle such issues with a barchart. I have to keep all data within one chart, so splitting the data is not an option, unfortunately.
This is what the sample looks like:
MUN_RESID X08.2005_P X09.2005_P X10.2005_P X11.2005_P X12.2005_P
1 120043 0.00000000 0.22382438 0.02797805 0.00000000 0.00000000
2 150775 0.00000000 0.02475672 0.00000000 0.00000000 0.00000000
3 170025 0.00000000 0.00000000 0.00000000 0.04305349 0.00000000
4 170382 0.04510756 0.00000000 0.00000000 0.00000000 0.00000000
5 171180 0.00000000 0.04180602 0.00000000 0.00000000 0.00000000
6 171525 0.04113143 0.00000000 0.00000000 0.00000000 0.00000000
7 172025 0.00000000 0.00000000 0.00000000 0.00000000 0.03480216
until 2015
X07.2015_P X08.2015_P X09.2015_P X10.2015_P X11.2015_P X12.2015_P
1 0 0.05595610 0 0 0 0.00000000
2 0 0.00000000 0 0 0 0.02475672
3 0 0.04305349 0 0 0 0.00000000
4 0 0.00000000 0 0 0 0.00000000
5 0 0.00000000 0 0 0 0.00000000
6 0 0.00000000 0 0 0 0.00000000
7 NA NA NA NA NA NA
[ reached 'max' / getOption("max.print") -- omitted 3 rows ]
==X==============================================================X==
Copy+Paste this part. (If on a Mac, it is already copied!)
==X==============================================================X==
months_total_f052 <- structure(list(MUN_RESID = structure(c(1L, 2L, 3L, 4L, 5L, 6L,7L, 171L, 172L, 173L), .Label = c("120043", "150775", "170025","170382", "171180", "171525", "172025", "220080", "220157", "220198","220207", "220360", "220860", "220960", "220975", "221010", "221037","240960", "241380", "241430", "241490", "250073", "250390", "251060","251380", "251520", "251560", "251570", "280500", "280690", "280730","310070", "310310", "310360", "310610", "310700", "310980", "311220","311470", "311620", "312150", "312190", "312460", "312737", "312790","314010", "314130", "314420", "314570", "314660", "314750", "315727","315870", "315970", "316310", "316490", "316590", "316805", "350075","350150", "350730", "350770", "351330", "351385", "351420", "351492","351495", "351610", "351800", "352060", "352540", "352580", "352885","353100", "353320", "353330", "353450", "354030", "354165", "354450","354765", "354830", "355200", "355460", "355520", "355530", "355570","410115", "410185", "410322", "411065", "411230", "411260", "411650","411729", "411740", "411845", "411925", "412030", "412033", "412420","420055", "420208", "420243", "420515", "420535", "420555", "421020","421085", "421315", "421568", "421590", "430045", "430047", "430057","430185", "430215", "430237", "430462", "430495", "430583", "430597","430637", "430786", "430980", "431036", "431041", "431065", "431127","431164", "431173", "431179", "431235", "431261", "431300", "431301","431308", "431310", "431335", "431346", "431407", "431455", "431477","431507", "431514", "431642", "431643", "431805", "431846", "431849","431861", "431935", "431937", "432045", "432235", "432252", "432285","432320", "432360", "500797", "510100", "510120", "510617", "510729","510810", "520120", "520360", "520393", "520710", "521015", "521200","521945", "522157"), class = "factor"), X08.2005_P = c(0, 0,0, 0.0451075641915337, 0, 0.0411314307409985, 0, 0, 0, 0), X09.2005_P = c(0.223824383944905,0.0247567176401135, 0, 0, 0.0418060200668896, 
0, 0, 0.0322436628801032,0.0314032417808054, 0), X10.2005_P = c(0.0279780479931131, 0,0, 0, 0, 0, 0, 0, 0, 0), X11.2005_P = c(0, 0, 0.0430534856764365,0, 0, 0, 0, 0, 0, 0), X12.2005_P = c(0, 0, 0, 0, 0, 0, 0.0348021630882904,0, 0, 0.0329205601559928), X01.2006_P = c(0, 0.0247567176401135,0, 0, 0, 0, 0, 0, 0, 0), X02.2006_P = c(0.0279780479931131, 0,0, 0, 0, 0, 0, 0.0322436628801032, 0.0314032417808054, 0.0329205601559928), X03.2006_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X04.2006_P = c(0,0, 0, 0, 0.0418060200668896, 0, 0, 0.0322436628801032, 0, 0),X05.2006_P = c(0.0279780479931131, 0, 0, 0, 0, 0, 0, 0, 0.0314032417808054,0), X06.2006_P = c(0, 0, 0.0430534856764365, 0.0451075641915337,0, 0.0411314307409985, 0, 0, 0, 0), X07.2006_P = c(0.0559560959862262,0, 0.0430534856764365, 0, 0, 0, 0, 0, 0, 0), X08.2006_P = c(0.0559560959862262,0, 0, 0, 0, 0, 0, 0.0322436628801032, 0, 0), X09.2006_P = c(0.0559560959862262,0, 0, 0, 0, 0, 0, 0, 0, 0), X10.2006_P = c(0.0279780479931131,0, 0, 0, 0, 0, 0, 0, 0, 0), X11.2006_P = c(0.0559560959862262,0, 0, 0, 0, 0, 0.0696043261765808, 0, 0, 0), X12.2006_P = c(0,0, 0, 0, 0, 0, 0.0348021630882904, 0, 0, 0.0329205601559928), X01.2007_P = c(0, 0, 0, 0.0451075641915337, 0, 0, 0, 0,0, 0), X02.2007_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X03.2007_P = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), X04.2007_P = c(0, 0.0247567176401135,0, 0, 0, 0, 0.0696043261765808, 0, 0, 0), X05.2007_P = c(0.0279780479931131,0, 0, 0, 0, 0, 0, 0, 0, 0), X06.2007_P = c(0, 0, 0, 0, 0,0, 0, 0, 0.0314032417808054, 0), X07.2007_P = c(0, 0, 0.0430534856764365,0.0451075641915337, 0, 0.0411314307409985, 0, 0, 0, 0.0329205601559928), X08.2007_P = c(0, 0, 0, 0, 0.0418060200668896, 0, 0, 0.0322436628801032,0, 0), X09.2007_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X10.2007_P = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), X11.2007_P = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), X12.2007_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), X01.2008_P = c(0, 0, 0, 0.0451075641915337, 0, 0, 0,0, 0, 0), X02.2008_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 
0), X03.2008_P = c(0.0279780479931131,0.0247567176401135, 0, 0, 0, 0, 0, 0, 0, 0), X04.2008_P = c(0,0, 0, 0.0451075641915337, 0.0418060200668896, 0.0411314307409985,0.0348021630882904, 0, 0, 0), X05.2008_P = c(0, 0, 0, 0,0, 0, 0, 0, 0, 0), X06.2008_P = c(0.0559560959862262, 0,0.0430534856764365, 0, 0, 0, 0, 0, 0, 0), X07.2008_P = c(0.0279780479931131,0, 0, 0, 0, 0, 0, 0, 0, 0), X08.2008_P = c(0.0279780479931131,0, 0, 0, 0, 0, 0, 0, 0, 0), X09.2008_P = c(0.0279780479931131,0, 0, 0, 0, 0, 0, 0.0322436628801032, 0, 0.0329205601559928), X10.2008_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X11.2008_P = c(0,0, 0, 0, 0, 0, 0, 0, 0.0314032417808054, 0), X12.2008_P = c(0,0, 0, 0, 0, 0, 0.0348021630882904, 0, 0, 0), X01.2009_P = c(0,0, 0, 0.0451075641915337, 0, 0, 0, 0, 0, 0), X02.2009_P = c(0,0, 0, 0, 0, 0, 0.0348021630882904, 0, 0, 0), X03.2009_P = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), X04.2009_P = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), X05.2009_P = c(0, 0, 0, 0, 0.0418060200668896,0, 0, 0, 0, 0), X06.2009_P = c(0, 0, 0.0430534856764365,0, 0, 0, 0, 0, 0, 0), X07.2009_P = c(0.0279780479931131,0.0247567176401135, 0, 0, 0, 0, 0, 0, 0.0314032417808054,0), X08.2009_P = c(0, 0, 0, 0, 0, 0.0411314307409985, 0,0, 0, 0), X09.2009_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0329205601559928), X10.2009_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X11.2009_P = c(0.0279780479931131,0, 0, 0, 0, 0, 0, 0.0322436628801032, 0, 0), X12.2009_P = c(0,0, 0, 0, 0, 0, 0, 0, 0.0314032417808054, 0), X01.2010_P = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), X02.2010_P = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), X03.2010_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), X04.2010_P = c(0.0279780479931131, 0, 0, 0, 0, 0, 0,0, 0, 0), X05.2010_P = c(0, 0.049513435280227, 0, 0, 0, 0,0, 0, 0.0314032417808054, 0), X06.2010_P = c(0, 0, 0, 0,0.0418060200668896, 0, 0, 0, 0, 0), X07.2010_P = c(0, 0,0, 0.0451075641915337, 0, 0, 0, 0, 0, 0), X08.2010_P = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), X09.2010_P = c(0, 0, 0, 0, 0,0.0822628614819971, 0, 0, 0, 0), X10.2010_P = c(0, 0, 0,0, 0, 0, 0, 
0.0322436628801032, 0, 0), X11.2010_P = c(0,0, 0, 0, 0, 0, 0.0348021630882904, 0, 0, 0), X12.2010_P = c(0,0, 0.086106971352873, 0, 0, 0, 0, 0, 0, 0.0329205601559928), X01.2011_P = c(0.0279780479931131, 0, 0, 0, 0, 0.0411314307409985,0, 0, 0, 0), X02.2011_P = c(0, 0.0247567176401135, 0, 0,0, 0, 0, 0.0322436628801032, 0, 0), X03.2011_P = c(0, 0,0, 0, 0.0418060200668896, 0, 0, 0, 0, 0), X04.2011_P = c(0,0, 0, 0.0451075641915337, 0, 0, 0, 0, 0.0314032417808054,0), X05.2011_P = c(0, 0, 0, 0, 0, 0.0411314307409985, 0.0348021630882904,0, 0, 0), X06.2011_P = c(0, 0, 0.0430534856764365, 0, 0,0, 0, 0, 0, 0), X07.2011_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0.0658411203119856), X08.2011_P = c(0, 0, 0, 0, 0, 0, 0,0, 0, 0), X09.2011_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X10.2011_P = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), X11.2011_P = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), X12.2011_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), X01.2012_P = c(0, 0.0247567176401135, 0, 0, 0, 0, 0,0, 0, 0.0329205601559928), X02.2012_P = c(0, 0.0742701529203405,0, 0, 0, 0.0411314307409985, 0, 0, 0, 0), X03.2012_P = c(0,0, 0, 0.0451075641915337, 0, 0, 0, 0, 0, 0), X04.2012_P = c(0,0, 0, 0.0902151283830673, 0, 0, 0, 0, 0, 0), X05.2012_P = c(0,0, 0.0430534856764365, 0, 0, 0, 0, 0, 0, 0), X06.2012_P = c(0,0, 0, 0, 0.0836120401337793, 0, 0, 0, 0.0314032417808054,0), X07.2012_P = c(0, 0, 0, 0, 0.0418060200668896, 0, 0,0, 0, 0), X08.2012_P = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), X09.2012_P = c(0,0, 0, 0, 0, 0, 0, 0.0322436628801032, 0, 0), X10.2012_P = c(0,0, 0, 0.0451075641915337, 0, 0, 0, 0, 0.0628064835616107,0), X11.2012_P = c(0, 0, 0, 0, 0, 0, 0.0348021630882904,0, 0, 0), X12.2012_P = c(0.0279780479931131, 0, 0, 0, 0,0, 0, 0, 0, 0), X01.2013_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0,NA), X02.2013_P = c(0.0279780479931131, 0, 0, 0, 0, 0, NA,0, 0, NA), X03.2013_P = c(0, 0, 0.0430534856764365, 0.0451075641915337,0, 0, NA, 0, 0, NA), X04.2013_P = c(0, 0, 0, 0, 0, 0.0411314307409985,NA, 0, 0, NA), X05.2013_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0,NA), 
X06.2013_P = c(0.0279780479931131, 0, 0, 0, 0, 0, NA,0, 0, NA), X07.2013_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0, NA),X08.2013_P = c(0, 0, 0, 0, 0, 0, NA, 0.0322436628801032,0.0314032417808054, NA), X09.2013_P = c(0.0279780479931131,0.049513435280227, 0, 0, 0.0418060200668896, 0, NA, 0.0322436628801032,0, NA), X10.2013_P = c(0.0279780479931131, 0, 0, 0, 0, 0,NA, 0, 0, NA), X11.2013_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0,NA), X12.2013_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0, NA), X01.2014_P = c(0,0, 0, 0, 0.0418060200668896, 0.0411314307409985, NA, 0, 0,NA), X02.2014_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0, NA), X03.2014_P = c(0,0, 0, 0, 0, 0, NA, 0, 0, NA), X04.2014_P = c(0, 0, 0, 0,0, 0, NA, 0, 0, NA), X05.2014_P = c(0, 0, 0.0430534856764365,0, 0, 0, NA, 0.0322436628801032, 0.0314032417808054, NA),X06.2014_P = c(0, 0, 0, 0, 0, 0, NA, 0, 0, NA), X07.2014_P = c(0,0, 0, 0, 0, 0, NA, 0, 0, NA), X08.2014_P = c(0, 0, 0, 0,0, 0, NA, 0, 0, NA), X09.2014_P = c(0.0279780479931131, 0.0247567176401135,0, 0, 0, 0, NA, 0, 0, NA), X10.2014_P = c(0, 0.0247567176401135,0, 0.0451075641915337, 0, 0, NA, 0, 0, NA), X11.2014_P = c(0,0, 0, 0, 0, 0, NA, 0, 0, NA), X12.2014_P = c(0, 0.0247567176401135,0, 0, 0, 0, NA, 0.0644873257602064, 0, NA), X01.2015_P = c(0,0, 0, 0, 0, 0, NA, 0, 0, NA), X02.2015_P = c(0, 0, 0, 0,0, 0, NA, 0, 0, NA), X03.2015_P = c(0, 0, 0, 0, 0, 0, NA,0, 0, NA), X04.2015_P = c(0, 0, 0, 0.0451075641915337, 0.0418060200668896,0.0822628614819971, NA, 0, 0, NA), X05.2015_P = c(0, 0, 0,0, 0, 0, NA, 0, 0, NA), X06.2015_P = c(0, 0.0247567176401135,0, 0, 0, 0, NA, 0, 0, NA), X07.2015_P = c(0, 0, 0, 0, 0,0, NA, 0, 0, NA), X08.2015_P = c(0.0559560959862262, 0, 0.0430534856764365,0, 0, 0, NA, 0, 0.0314032417808054, NA), X09.2015_P = c(0,0, 0, 0, 0, 0, NA, 0, 0, NA), X10.2015_P = c(0, 0, 0, 0,0, 0, NA, 0, 0, NA), X11.2015_P = c(0, 0, 0, 0, 0, 0, NA,0.0322436628801032, 0, NA), X12.2015_P = c(0, 0.0247567176401135,0, 0, 0, 0, NA, 0, 0, NA)), row.names = c(1L, 2L, 3L, 4L,5L, 6L, 7L, 171L, 172L, 173L), class 
= "data.frame")
==X==============================================================X==
I used this code for plotting:
# Convert MUN_RESID to a factor so it is treated as a categorical variable
# when plotting.
months_total_f052$MUN_RESID <- as.factor(months_total_f052$MUN_RESID)
# Bar chart of every monthly *_P column (X08.2005_P through X12.2015_P),
# combined with `+` on the left-hand side of the formula, against MUN_RESID.
# Presumably this is lattice::barchart (auto.key / scales are lattice-style
# arguments) -- confirm which package is attached.
# NOTE(review): the data frame is supplied twice, once as the first positional
# argument and once via `data =`; the documented formula interface is
# barchart(formula, data = ...). Confirm the leading argument is intended.
# Remaining arguments: auto.key places the legend on the left, horiz = FALSE
# asks for vertical bars (partial match of `horizontal`, presumably),
# ylab sets the y-axis label, and scales rotates the x-axis labels by 90 degrees.
barchart(months_total_f052, X08.2005_P+X09.2005_P+X10.2005_P+X11.2005_P+X12.2005_P+X01.2006_P+X02.2006_P+X03.2006_P+X04.2006_P+X05.2006_P+X06.2006_P+X07.2006_P+X08.2006_P+X09.2006_P+X10.2006_P+X11.2006_P+X12.2006_P+
X01.2007_P+X02.2007_P+X03.2007_P+X04.2007_P+X05.2007_P+X06.2007_P+X07.2007_P+X08.2007_P+X09.2007_P+X10.2007_P+X11.2007_P+X12.2007_P+
X01.2008_P+X02.2008_P+X03.2008_P+X04.2008_P+X05.2008_P+X06.2008_P+X07.2008_P+X08.2008_P+X09.2008_P+X10.2008_P+X11.2008_P+X12.2008_P+
X01.2009_P+X02.2009_P+X03.2009_P+X04.2009_P+X05.2009_P+X06.2009_P+X07.2009_P+X08.2009_P+X09.2009_P+X10.2009_P+X11.2009_P+X12.2009_P+
X01.2010_P+X02.2010_P+X03.2010_P+X04.2010_P+X05.2010_P+X06.2010_P+X07.2010_P+X08.2010_P+X09.2010_P+X10.2010_P+X11.2010_P+X12.2010_P+
X01.2011_P+X02.2011_P+X03.2011_P+X04.2011_P+X05.2011_P+X06.2011_P+X07.2011_P+X08.2011_P+X09.2011_P+X10.2011_P+X11.2011_P+X12.2011_P+
X01.2012_P+X02.2012_P+X03.2012_P+X04.2012_P+X05.2012_P+X06.2012_P+X07.2012_P+X08.2012_P+X09.2012_P+X10.2012_P+X11.2012_P+X12.2012_P+
X01.2013_P+X02.2013_P+X03.2013_P+X04.2013_P+X05.2013_P+X06.2013_P+X07.2013_P+X08.2013_P+X09.2013_P+X10.2013_P+X11.2013_P+X12.2013_P+
X01.2014_P+X02.2014_P+X03.2014_P+X04.2014_P+X05.2014_P+X06.2014_P+X07.2014_P+X08.2014_P+X09.2014_P+X10.2014_P+X11.2014_P+X12.2014_P+
X01.2015_P+X02.2015_P+X03.2015_P+X04.2015_P+X05.2015_P+X06.2015_P+X07.2015_P+X08.2015_P+X09.2015_P+X10.2015_P+X11.2015_P+X12.2015_P ~ MUN_RESID, data = months_total_f052, auto.key = list(space = 'left'), horiz = FALSE, ylab="percent_dead", scales=list(x=list(rot=90)))
This might be what you are after? I think with a ton of data "small multiples" is a decent approach. Without your full data set I don't think you'll get the picture but worth a try:
library(tidyverse)

# Small illustrative subset of the wide data: one row per MUN_RESID and one
# column per month, with names of the form "X<month>.<year>_P".
dat <- tribble(
  ~"MUN_RESID", ~"X08.2005_P", ~"X09.2005_P", ~"X10.2005_P", ~"X11.2005_P", ~"X12.2005_P",
  120043, 0.00000000, 0.22382438, 0.02797805, 0.00000000, 0.00000000,
  150775, 0.00000000, 0.02475672, 0.00000000, 0.00000000, 0.00000000,
  170025, 0.00000000, 0.00000000, 0.00000000, 0.04305349, 0.00000000,
  170382, 0.04510756, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
  171180, 0.00000000, 0.04180602, 0.00000000, 0.00000000, 0.00000000,
  171525, 0.04113143, 0.00000000, 0.00000000, 0.00000000, 0.00000000,
  172025, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.03480216
)

# Reshape to long format: one row per (MUN_RESID, month, year).
# pivot_longer() replaces the superseded gather(); the column name is then
# split at the dot ("X08" / "2005_P") and the digits are pulled out of each part.
long_data <- dat %>%
  pivot_longer(-MUN_RESID, names_to = "date", values_to = "value") %>%
  separate(col = date, into = c("month", "year"), sep = "\\.") %>%
  mutate(
    month = parse_double(str_extract(month, "\\d+")),
    year = parse_double(str_extract(year, "\\d+")),
    # Discrete copy of month so that fill uses a categorical palette.
    my_month = factor(month)
  )

# Small multiples: one panel per year, one dodged bar per month.
# MUN_RESID is an identifier, so it is mapped as a factor; on a continuous
# axis the bars would be spaced by the numeric gaps between the codes.
# geom_col() already uses stat = "identity", so no `stat` argument is passed
# (supplying one only triggers an "unknown parameters" warning).
long_data %>%
  ggplot(aes(factor(MUN_RESID), value)) +
  facet_wrap(~year) +
  geom_col(aes(fill = my_month), position = "dodge") +
  coord_flip() +
  labs(x = "MUN_RESID") +
  theme_minimal()
I have a data frame consisting of 1200 columns. I would like to reduce this to a more manageable number, and thought of computing the correlation between every pair of columns and combining the columns that have r > 0.9.
here is the result from dput(head(a, 20))
ClassTechnology..Writing.instruments = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ClassTechnology..Ink.and.lead.refills = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Printing.machinery.and.equipment = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Printing.machinery.accessories = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ClassTechnology..Printing.accessories = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Book.binding.and.sewing.equipment.and.accessories = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Printing.laboratory.equipment.and.accessories = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Lecterns.and.sound.systems.and.accessories = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), ClassTechnology..Projectors.and.supplies = c(0,
0, 0, 0, 0, 0, 0, 0, 0.28903076, 0, 0, 0, 0, 0, 0.086521352,
0, 0, 0, 0, 0), ClassTechnology..Audio.presentation.and.composing.equipment.and.hardware.and.controllers = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Video.and.combination.video.and.audio.presentation.equipment.and.hardware.and.controllers = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Phone.and.video.conference.equipment.and.hardware.and.controllers = c(0.0695846,
0, 0, 0.2, 0, 0, 0, 0, 0.190946024, 0, 0.2, 0, 0, 0, 0.086521352,
0.2, 0, 0, 0, 0), ClassTechnology..Microfilm.equipment.and.supplies = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Cameras = c(0.8608308, 0, 0, 0.2, 0, 0,
0.25, 0, 0, 0, 0.2, 0.792574627, 0.25, 0.25, 0.913478648,
0.2, 0, 0, 1, 0), ClassTechnology..Camera.accessories = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.086521352, 0, 0,
0, 0, 0), ClassTechnology..Photographic.processing.equipment = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Microfilm.production.equipment.and.supplies = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Darkroom.supplies = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ClassTechnology..Firearms = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.740435943, 0, 0,
0, 0, 0), ClassTechnology..Arms.and.ammunition.accessories = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L)
ClassTechnology..Video.and.combination.video.and.audio.presentation.equipment.and.hardware.and.controllers = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Phone.and.video.conference.equipment.and.hardware.and.controllers = c(0.0695846,
0, 0, 0.2, 0, 0, 0, 0, 0.190946024, 0, 0.2, 0, 0, 0, 0.086521352,
0.2, 0, 0, 0, 0), ClassTechnology..Microfilm.equipment.and.supplies = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
ClassTechnology..Cameras = c(0.8608308, 0, 0, 0.2, 0, 0,
0.25, 0, 0, 0, 0.2, 0.792574627, 0.25, 0.25, 0.913478648,
0.2, 0, 0, 1, 0),
I ran the following code:
# Build a table with one row per unordered pair of columns (from column 22
# onwards) holding both column names and their correlation.
# The result matrix is grown with rbind() on every pair, exactly as before,
# so the run time still increases steeply with the number of columns.
col_names <- colnames(a)
col_idx <- 22:ncol(a)
df <- NULL
for (left in col_idx) {
  for (right in col_idx) {
    # Only visit each pair once (right strictly after left).
    if (right > left) {
      # Arguments are kept as calls, not bare symbols, so cbind() does not
      # attach column names and the data frame columns stay V1, V2, V3.
      df <- rbind(
        df,
        cbind(col_names[left], col_names[right], cor(a[, left], a[, right]))
      )
    }
  }
}
df <- as.data.frame(df)
# cbind() produced a character matrix, so the correlation column has to be
# converted back to numeric (via character in case it became a factor).
df[, 3] <- as.numeric(as.character(df[, 3]))
But it takes an incredibly long time (more than 2 hours so far), and I wonder whether anyone has suggestions for how to speed up this calculation, or for a different way to reduce the dimensions.
I would like to find the smallest distance between the profiles stored in a data frame. I am interested especially in one row in comparison to the rest of the rows stored in the data frame.
That's a data frame:
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `34` = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 393090, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6718400,
0, 311350, 0), `59` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2164949.7,
4834137.6, 0, 0, 0, 1187816.7, 0, 0, 0, 0, 0, 0, 1340912.5, 0
), `84` = c(0, 0, 0, 0, 0, 0, 0, 0, 8607100, 0, 0, 17586713.2,
22629743.6, 0, 0, 0, 2808791.7, 0, 0, 4026222.5, 0, 0, 0, 1981900,
0), `110` = c(2296000, 0, 0, 0, 0, 2140221.7, 0, 0, 5809230.6,
0, 0, 37134898.5, 3861828.7, 2553100, 0, 12075845.8, 0, 0, 1272950,
8695273, 0, 0, 2657180, 2710080, 0), `134` = c(0, 0, 0, 1176150,
0, 1329596.7, 1471000, 0, 6511934, 6511934, 0, 18709227.3, 0,
1041211.2, 0, 6544176.9, 0, 0, 2412651.7, 7724956.9, 2878418.3,
0, 8620131.7, 2386972.8, 0), `165` = c(0, 1226610, 0, 1345098.7,
2083771.9, 0, 1808231.4, 0, 0, 10742997.7, 0, 13060798.9, 0,
538340, 538340, 2791649.5, 0, 0, 6217622, 1316097.1, 4716931.8,
0, 6615816.9, 1510532, 0), `199` = c(0, 1571525, 0, 1903038.3,
1676700, 0, 888832.2, 0, 0, 9084418.6, 0, 11189460.1, 0, 0, 1807662.5,
2564275, 0, 0, 18080359.7, 0, 0, 0, 2397710.2, 1717949.2, 0),
`234` = c(0, 1314900, 2482696, 1325684, 0, 0, 0, 0, 0, 7321432.7,
0, 9843409.2, 0, 0, 1073341.7, 2762775, 0, 0, 9335312.8,
0, 0, 0, 1950788.2, 1509100, 0), `257` = c(0, 1568700, 14604298.7,
940162.2, 0, 0, 0, 0, 0, 4779505.9, 0, 9691692.4, 0, 0, 735290,
2650165, 0, 2311383.7, 5193383.4, 0, 0, 0, 1341998.7, 1225325.6,
0), `362` = c(0, 0, 4190740.5, 288800, 0, 0, 0, 0, 0, 4846634.8,
0, 9574498.7, 0, 0, 0, 1425600, 0, 8339312.1, 3877892.5,
0, 0, 0, 1752866.7, 0, 0), `433` = c(0, 0, 773280, 0, 0,
0, 0, 0, 0, 3926582.8, 3926582.8, 5962586.5, 0, 0, 0, 1041400,
0, 1972909.3, 1895439.4, 0, 0, 0, 963891.2, 0, 1109800),
`506` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9332272, 0, 0, 0,
0, 0, 0, 2219100, 0, 0, 0, 0, 0, 0, 0), `581` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 4371537.1, 0, 0, 0, 0, 0, 0, 2428800,
0, 0, 0, 0, 0, 0, 0), `652` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1689871.4, 0, 0, 0, 0, 0, 0, 988399.7, 0, 0, 0, 0, 0,
0, 0), `733` = c(0, 0, 0, 0, 0, 0, 0, 1250100, 0, 0, 1754205.3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `818` = c(0, 0,
0, 0, 0, 0, 0, 517340, 0, 0, 1149227.6, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), `896` = c(0, 0, 0, 0, 0, 0, 0, 579846.7,
0, 0, 985931.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
`972` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 858255.5, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1039` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 848993.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), .Names = c("10", "34", "59", "84", "110", "134",
"165", "199", "234", "257", "362", "433", "506", "581", "652",
"733", "818", "896", "972", "1039"), row.names = c("Mark_1",
"Mark_2", "Alex_1", "Katrin_1", "Georg_1", "Martin_1",
"Tim_1", "Tom_1", "Mike_1", "Mike_2", "Mike_3",
"Hare_1", "Dea_1", "Monty_1", "Monty_2", "Niko_1",
"Lee_1", "Marq_1", "Otto_1", "Priaq_1", "Surkta_1",
"Norsa_1", "Norsa_2", "Quer_1", "Quer_2"), class = "data.frame")
So the row named Katrin_1 is the one I am interested in. I would like to find which rows have the smallest Euclidean distance to Katrin_1, let's say the 3-5 closest rows.
Let's drop the Katrin_1 row with df[!rownames(df) %in% "Katrin_1", ], subtract df["Katrin_1", ] from each of the remaining rows with sweep, compute the squared Euclidean distances by squaring the resulting matrix element-wise and applying rowSums, and use which.min to get the final result:
# Squared Euclidean distance from every other row to the reference row.
# Squared distances rank the rows the same way as the distances themselves,
# so the square root is skipped.
target <- "Katrin_1"
others <- df[!rownames(df) %in% target, ]
sq_dist <- rowSums(sweep(others, 2, as.numeric(df[target, ]), `-`)^2)

# Single closest row.
names(which.min(sq_dist))
# [1] "Mark_2"

# The k closest rows, nearest first (the question asks for 3-5).
k <- 5
names(head(sort(sq_dist), k))
This should be much more efficient than using dist, as dist would compute all possible pairwise distances, while we need only a few.