Applying a function inside a dplyr pipe command - r

I am trying to apply a Trim function from the DescTools package to a data frame in R using the dplyr package.
What I have so far is the following:
x <- df %>%
group_by(Country) %>%
mutate_all(OfferPrice, Trim(trim = 0.1, na.rm = TRUE))
Which returns the following error:
Error in Trim(trim = 0.1, na.rm = TRUE) :
argument "x" is missing, with no default
I know it's a problem with the arguments inside the Trim() part of the mutate call, but I cannot seem to apply this function inside dplyr.
The function trims the top and bottom 10% of the observations, hopefully removing any extreme values.
Data:
df <- structure(list(Country = c("AU", "AU", "AU", "AU", "AU", "AU",
"AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU",
"AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU",
"AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU",
"AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU",
"AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU", "AU",
"AU", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA",
"CA", "CA", "CA", "CA", "CA", "CA", "GB", "GB", "GB", "GB", "GB",
"GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB",
"GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB",
"GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB",
"GB", "GB", "GB", "GB", "GB", "GB"), OfferPrice = c(0.25, 0.55,
0.065, 0.075, 0.019, 0.0114, 0.18, 0.015, 2.8, 3.62, 0.025, 0.07,
0.6, 0.9, 0.12, 2.72, 0.015, 0.015, 0.32, 0.2, 0.063, 0.01, 1.42,
0.0045, 0.02, 1.15, 0.2, 17.05, 0.009, 1.8, 3.22, 0.135, 0.35,
5, 0.37, 0.023, 0.014, 0.023, 0.35, 1.25, 0.05, 0.059, 0.2, 0.025,
5.45, 0.05, 0.3, 0.22, 0.04, 0.035, 2, 0.32, 0.2, 0.2, 0.02,
0.34, 0.04, 0.025, 0.03, 0.0125, 1.6, 0.03, 0.15, 13.5, 0.1,
0.3, 0.13, 0.115, 0.35, 0.2, 0.6, 0.7, 8, 14, 25, 15.75, 3.8,
2, 0.5, 35.2, 1.75, 0.12, 0.48, 0.15, 0.7, 0.075, 0.15, 14.5,
0.29, 0.58, 1.75, 9, 11.5, 0.5, 0.075, 0.12, 1.1, 0.6, 0.75,
0.26, 0.2, 0.12, 0.49, 12, 6.85, 0.55, 0.25, 1.6, 0.36, 0.06,
2, 0.272, 41, 0.15, 1.1, 4.1, 0.6, 0.08, 1.4, 3, 0.09, 0.15,
0.2, 0.3, 0.8, 0.21, 0.1, 0.05, 0.17, 0.1, 0.15, 0.05, 0.3, 0.6,
0.2, 0.5, 3.45, 3, 0.07, 0.1, 0.3, 7.2, 0.4, 0.1, 12.5, 0.07,
0.375, 0.25, 0.3, 1.15, 0.2, 3, 1, 0.3, 0.25, 530, 262, 20, 37.5,
3422, 295, 100, 0.085, 1925, 0.3, 107.5, 10, 2.1, 3, 15, 300,
690, 50, 410, 100, 120, 225, 40, 100, 100, 51, 10, 82, 9.58,
269, 0.5, 271, 100, 108, 0.3, 4.5, 0.5, 0.55, 50, 0.95, 275,
100, 170, 0.7), OfferTo1stOpen = c(18, -2.727274895, 9.230772972,
6.666662216, -15.78947067, 5.263155937, -2.777781725, 13.33333588,
5.000001907, -3.591157198, -0.000001490116119, 1.428570986, -4.166670322,
0.00000264909545, -34.16666412, -0.000001051846652, 26.66666985,
26.66666985, 9.375002861, 2.499998569, 6.34920454, 0.000002235174179,
-0.7042223215, -11.11110687, 15.00000286, 1.304349899, -0.000001490116119,
6.217013359, 11.11111546, 25.00000381, 0.9316761494, -0.000003973642833,
-15.71428394, 17.20000076, -0.000001288749104, 4.347826004, 14.28571033,
13.04347801, 4.285716057, 43.20000076, 1.99999845, 10.16949081,
2.499998569, -4.000001431, -0.1834827513, 11.99999809, -1.666670561,
95.45454407, -12.49999809, 25.7142849, -0.5, 18.75000191, -0.000001490116119,
-17.50000191, -9.999998093, 44.11764526, 15.00000286, 19.99999809,
0.000002235174179, 35.99999619, 10.62499809, 76.66667175, 6.666662216,
-0.3703703582, -10.00000095, -100, 146.1538544, 65.21739197,
-11.42856979, 14.99999809, -5.000003815, -11.42856979, 1.625,
6.785714149, NA, 3.492063522, -3.684209347, -2.5, 10, -1.420456648,
1.142857194, -12.49999809, -1.041664481, -0.000003973642833,
-14.2857132, 39.99999619, 36.66666031, -0.3448275924, -15.51723862,
-12.06896305, -18.2857151, 0.555555582, -5.434782505, 590, -6.666670322,
0.000002235174179, 1.818179607, 36.66666031, -6.666666508, 0.000003667978262,
-10.00000095, 20.83333588, -20.40816498, -2.916666746, -29.1970787,
-0.000002167441608, -10, -18.80635834, -100, 8.333335876, -3.5,
10.29411125, 2.097560883, -6.666670322, 7.272725105, 0.7317096591,
19.99999619, 81.25000763, 45.00000381, -20, -11.1111145, -0.000003973642833,
-7.500001431, -0.000003973642833, -1.250001431, -14.28571129,
49.99999619, -10.00000095, -5.882353783, NA, 23.33332825, 19.99999809,
18.33332825, -13.33333683, 34.99999619, -34, -19.71014595, -32.33333206,
-21.4285717, -20.00000191, -100, 0.1388915479, 7.499998569, -20.00000191,
-0.2399999946, 257.1428528, -16, 54, NA, -4.347824097, -100,
6, 1, 4.999995708, -8, 8.301886559, 3.511450291, 25, 16, -1.461133838,
-1.694915295, 1, 17.64705849, 3.376623392, 24.99999428, 3.255813837,
34, 0.00000454130668, -3.333333254, 10.33333302, 1.666666627,
16.231884, 9, 1.829268336, 3, 11.66666698, 4.888888836, 14.25,
3.5, 3.5, -4.411764622, 0.200000003, 1.829268336, 53.96659851,
9.665427208, 5, -1.586715817, 2, 1.111111164, 4.999995708, -10,
5, -4.545456409, NA, 7.894738197, 5.454545498, 1, 11.17647076,
25.00000191), OfferTo1stClose = c(8, -7.272729397, 9.230772972,
7.999995708, -21.05262947, -3.508773565, -2.777781725, 0.000002235174179,
3.571430445, -3.867400169, -0.000001490116119, 1.428570986, -6.666670322,
-1.666664004, -35.83333206, -3.308824539, 13.33333588, 26.66666985,
10.93750286, -0.000001490116119, 6.34920454, -9.999998093, -0.3521096706,
11.11111546, 5.000002384, -0.4347805381, -2.500001431, 3.519066334,
11.11111546, 27.22222519, 4.34782505, -7.407411098, -17.1428566,
15.39999962, 4.05405283, -0.0000001943629684, 7.142853737, 13.04347801,
2.857144594, 43.20000076, 3.999998569, 10.16949081, -7.500001431,
3.999998569, -0.5504552126, 19.99999809, -1.666670561, 170.4545441,
-14.99999809, 31.4285717, -0.5, 18.75000191, -20.00000191, -17.50000191,
0.000002235174179, 44.11764526, 12.50000286, 15.99999809, 3.333335638,
35.99999619, 10.62499809, 123.3333359, 13.3333292, -1.481481433,
-10.00000095, -100, 138.4615479, 47.82608414, -12.85714149, 32.49999619,
-13.33333683, -24.2857132, 1.75, -0.3571428657, NA, 3.93650794,
-7.894735813, -7, 20, -0.9375021458, 1.714285731, -8.333331108,
-1.041664481, 3.333329201, -19.99999809, 33.33332825, 33.33332825,
-0.06896551698, -16.3793087, -16.3793087, -18.2857151, 2.666666746,
2.173913002, 590, -6.666670322, -16.66666412, 2.727270603, 44.99999237,
-10.66666698, 1.923080683, -12.50000095, 16.66666985, -22.44898033,
-4.166666508, -39.85401535, -3.636365652, -12, -16.8959198, -100,
0.000002235174179, -3.5, 13.97058201, 2.707317114, -8.066670418,
5.454543114, 0.4878072143, 19.99999619, 87.50000763, 45.7142868,
-25.66666603, -5.555559158, 16.66666222, -2.500001431, 3.333329201,
-0.000001490116119, -14.28571129, 49.99999619, -10.00000095,
-5.882353783, NA, 39.99999619, 19.99999809, 13.3333292, -10.00000381,
65, -26, -19.71014595, -31.66666603, -21.4285717, -20.00000191,
-100, -0.1388862431, 11.24999809, -20.00000191, -1.679999948,
228.5714264, -22.66666603, 42, NA, -7.826085091, -100, 6.666666508,
0, 4.999995708, -8, 8.301886559, 3.969465733, 26, 16, -5.084745884,
1.322033882, 1.5, 17.64705849, 2.077922106, 24.99999428, 3.255813837,
43, 0.00000454130668, -4.166666508, 10.33333302, 1.333333373,
18.69565201, 9, 1.829268336, 3, 11.66666698, 3.111111164, 15,
3.5, 3.5, -4.411764622, 0.6000000238, 50.60975647, 53.96659851,
37.54646683, 0, -0.1476014704, 3, 1.296296239, 4.999995708, -11.11111069,
5, -0.000002167441608, NA, 7.894738197, 4.181818008, 0.5, 10.88235283,
25.00000191)), row.names = c(NA, -199L), vars = "Country", drop = TRUE, indices = list(
0:61, 62:154, 155:198), group_sizes = c(62L, 93L, 44L), biggest_group_size = 93L, labels = structure(list(
Country = c("AU", "CA", "GB")), row.names = c(NA, -3L), class = "data.frame", vars = "Country", drop = TRUE, indices = list(
0:61, 62:154, 155:198), group_sizes = c(62L, 93L, 44L), biggest_group_size = 93L, labels = structure(list(
Country = c("AU", "CA", "GB")), row.names = c(NA, -3L), class = "data.frame", vars = "Country", drop = TRUE)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

I think you'll need to do this with do since the action of Trim is to return essentially a subset of observations. Try:
x <- df %>%
group_by(Country) %>%
do(
Trim(.$OfferPrice, trim = 0.1, na.rm = TRUE)
)
You could then use lapply or map inside the do statement to Trim each column of data, but I'm not sure if this is actually what you want. It's unclear since you have not provided any sample data. The attempt to use mutate_all suggests you want to Trim each column of data separately, but this doesn't make sense to me.
EDIT: Based on your comment, what you really want is to filter the data frame by the trimmed OfferPrice column, so:
x <- df %>%
group_by(Country) %>%
do(
.[attr(Trim(.$OfferPrice, trim = 0.1, na.rm = TRUE), "trim"), ]
)
See the documentation of Trim for details, specifically
The indices of the trimmed values will be attached as attribute named "trim".

Assuming that you want to drop the entire row of df for every element of OfferPrice that Trim(OfferPrice, ...) excludes: get the "trim" attribute of the result of Trim(...), which holds the indices of the excluded values, and remove those rows with slice, doing all of this by Country.
library(dplyr)
library(DescTools)
df %>%
group_by(Country) %>%
slice(-attr(Trim(OfferPrice, trim = 0.1, na.rm = TRUE), "trim")) %>%
ungroup
This could also be written:
df %>%
group_by(Country) %>%
slice(OfferPrice %>%
Trim(trim = 0.1, na.rm = TRUE) %>%
attr("trim") %>%
`-`) %>%
ungroup

Related

Classifying the words as per emotions & counts in the song lyrics

library(rvest)
library(dplyr)
library(tidyr)
library(spotifyr)
library(tidytext)
library(textdata)
Using the above libraries I scraped artist data from Spotify using the API token.
I've got the data of words with sentiments (i.e. anger let's say) and the details about the songs.
I now want to loop over multiple such word categories (e.g. anger) and see which words are most used for each particular emotion. More generally, I also want to plot a histogram of the words used in the songs.
So I use the following functions:
data %>%
unnest() %>%
unnest_tokens(word, lyric) %>%
anti_join(stop_words, by = "word") %>%
left_join(angry_words, by = "word") %>%
group_by(track_name, energy, album_name, duration_ms, valence) %>%
summarize(angry_words = sum(anger, na.rm = TRUE)) %>%
ungroup() %>%
select(track_name, album_name, angry_words) %>%
arrange(desc(angry_words))
Every time I run the code, I get the following error:
Error in `fn()`:
! In row 64, can't recycle input of size 3 to size 2.
Run `rlang::last_error()` to see where the error occurred.
Warning message:
`cols` is now required when using unnest().
Please use `cols = c(album_images, artists, available_markets)`
All suggestions will be helpful.
Here are the data and angry_words data frames:
data <- structure(list(artist_name = c("María José Llergo", "María José Llergo"
), artist_id = c("70GBRlKEGjfueop2lfdQ4Q", "70GBRlKEGjfueop2lfdQ4Q"
), album_id = c("6BMyWViSAgXtUVlPfXiGES", "6BMyWViSAgXtUVlPfXiGES"
), album_type = c("album", "album"), album_images = list(structure(list(
height = c(640L, 300L, 64L), url = c("https://i.scdn.co/image/ab67616d0000b2735f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d00001e025f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d000048515f3d845e18e06df1bbe95178"
), width = c(640L, 300L, 64L)), class = "data.frame", row.names = c(NA,
3L)), structure(list(height = c(640L, 300L, 64L), url = c("https://i.scdn.co/image/ab67616d0000b2735f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d00001e025f3d845e18e06df1bbe95178",
"https://i.scdn.co/image/ab67616d000048515f3d845e18e06df1bbe95178"
), width = c(640L, 300L, 64L)), class = "data.frame", row.names = c(NA,
3L))), album_release_date = c("2020-01-31", "2020-01-31"), album_release_year = c(2020,
2020), album_release_date_precision = c("day", "day"), danceability = c(0.612,
0.5), energy = c(0.342, 0.267), key = c(4L, 7L), loudness = c(-9.193,
-11.736), mode = 0:1, speechiness = c(0.0419, 0.0448), acousticness = c(0.358,
0.815), instrumentalness = c(0.000502, 2.66e-06), liveness = c(0.257,
0.0981), valence = c(0.122, 0.264), tempo = c(99.993, 114.192
), track_id = c("7pB0e4E78UfAmKBPzQPo8a", "1sgH6adzL1BBaIXRC7NOYI"
), analysis_url = c("https://api.spotify.com/v1/audio-analysis/7pB0e4E78UfAmKBPzQPo8a",
"https://api.spotify.com/v1/audio-analysis/1sgH6adzL1BBaIXRC7NOYI"
), time_signature = 3:4, artists = list(structure(list(href = "https://api.spotify.com/v1/artists/70GBRlKEGjfueop2lfdQ4Q",
id = "70GBRlKEGjfueop2lfdQ4Q", name = "María José Llergo",
type = "artist", uri = "spotify:artist:70GBRlKEGjfueop2lfdQ4Q",
external_urls.spotify = "https://open.spotify.com/artist/70GBRlKEGjfueop2lfdQ4Q"), class = "data.frame", row.names = 1L),
structure(list(href = "https://api.spotify.com/v1/artists/70GBRlKEGjfueop2lfdQ4Q",
id = "70GBRlKEGjfueop2lfdQ4Q", name = "María José Llergo",
type = "artist", uri = "spotify:artist:70GBRlKEGjfueop2lfdQ4Q",
external_urls.spotify = "https://open.spotify.com/artist/70GBRlKEGjfueop2lfdQ4Q"), class = "data.frame", row.names = 1L)),
available_markets = list(c("AD", "AE", "AG", "AL", "AM",
"AO", "AR", "AT", "AU", "AZ", "BA", "BB", "BD", "BE", "BF",
"BG", "BH", "BI", "BJ", "BN", "BO", "BR", "BS", "BT", "BW",
"BY", "BZ", "CA", "CD", "CG", "CH", "CI", "CL", "CM", "CO",
"CR", "CV", "CW", "CY", "CZ", "DE", "DJ", "DK", "DM", "DO",
"DZ", "EC", "EE", "EG", "ES", "FI", "FJ", "FM", "FR", "GA",
"GB", "GD", "GE", "GH", "GM", "GN", "GQ", "GR", "GT", "GW",
"GY", "HK", "HN", "HR", "HT", "HU", "ID", "IE", "IL", "IN",
"IQ", "IS", "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KI",
"KM", "KN", "KR", "KW", "KZ", "LA", "LB", "LC", "LI", "LK",
"LR", "LS", "LT", "LU", "LV", "LY", "MA", "MC", "MD", "ME",
"MG", "MH", "MK", "ML", "MN", "MO", "MR", "MT", "MU", "MV",
"MW", "MX", "MY", "MZ", "NA", "NE", "NG", "NI", "NL", "NO",
"NP", "NR", "NZ", "OM", "PA", "PE", "PG", "PH", "PK", "PL",
"PS", "PT", "PW", "PY", "QA", "RO", "RS", "RW", "SA", "SB",
"SC", "SE", "SG", "SI", "SK", "SL", "SM", "SN", "SR", "ST",
"SV", "SZ", "TD", "TG", "TH", "TJ", "TL", "TN", "TO", "TR",
"TT", "TV", "TW", "TZ", "UA", "UG", "US", "UY", "UZ", "VC",
"VE", "VN", "VU", "WS", "XK", "ZA", "ZM", "ZW"), c("AD",
"AE", "AG", "AL", "AM", "AO", "AR", "AT", "AU", "AZ", "BA",
"BB", "BD", "BE", "BF", "BG", "BH", "BI", "BJ", "BN", "BO",
"BR", "BS", "BT", "BW", "BY", "BZ", "CA", "CD", "CG", "CH",
"CI", "CL", "CM", "CO", "CR", "CV", "CW", "CY", "CZ", "DE",
"DJ", "DK", "DM", "DO", "DZ", "EC", "EE", "EG", "ES", "FI",
"FJ", "FM", "FR", "GA", "GB", "GD", "GE", "GH", "GM", "GN",
"GQ", "GR", "GT", "GW", "GY", "HK", "HN", "HR", "HT", "HU",
"ID", "IE", "IL", "IN", "IQ", "IS", "IT", "JM", "JO", "JP",
"KE", "KG", "KH", "KI", "KM", "KN", "KR", "KW", "KZ", "LA",
"LB", "LC", "LI", "LK", "LR", "LS", "LT", "LU", "LV", "LY",
"MA", "MC", "MD", "ME", "MG", "MH", "MK", "ML", "MN", "MO",
"MR", "MT", "MU", "MV", "MW", "MX", "MY", "MZ", "NA", "NE",
"NG", "NI", "NL", "NO", "NP", "NR", "NZ", "OM", "PA", "PE",
"PG", "PH", "PK", "PL", "PS", "PT", "PW", "PY", "QA", "RO",
"RS", "RW", "SA", "SB", "SC", "SE", "SG", "SI", "SK", "SL",
"SM", "SN", "SR", "ST", "SV", "SZ", "TD", "TG", "TH", "TJ",
"TL", "TN", "TO", "TR", "TT", "TV", "TW", "TZ", "UA", "UG",
"US", "UY", "UZ", "VC", "VE", "VN", "VU", "WS", "XK", "ZA",
"ZM", "ZW")), disc_number = c(1L, 1L), duration_ms = c(197316L,
313028L), explicit = c(FALSE, FALSE), track_href = c("https://api.spotify.com/v1/tracks/7pB0e4E78UfAmKBPzQPo8a",
"https://api.spotify.com/v1/tracks/1sgH6adzL1BBaIXRC7NOYI"
), is_local = c(FALSE, FALSE), track_name = c("¿De Qué Me Sirve Llorar?",
"Niña De Las Dunas"), track_preview_url = c("https://p.scdn.co/mp3-preview/1ed3fba536f1813af99c88f69893dfe6272df847?cid=cf686ca455c74783b8f27d0c35dfc5b0",
"https://p.scdn.co/mp3-preview/e4f9386ef79ff5027800aa9ccd8560a622df28d0?cid=cf686ca455c74783b8f27d0c35dfc5b0"
), track_number = 1:2, type = c("track", "track"), track_uri = c("spotify:track:7pB0e4E78UfAmKBPzQPo8a",
"spotify:track:1sgH6adzL1BBaIXRC7NOYI"), external_urls.spotify = c("https://open.spotify.com/track/7pB0e4E78UfAmKBPzQPo8a",
"https://open.spotify.com/track/1sgH6adzL1BBaIXRC7NOYI"),
album_name = c("Sanación", "Sanación"), key_name = c("E",
"G"), mode_name = c("minor", "major"), key_mode = c("E minor",
"G major")), row.names = 1:2, class = "data.frame")
angry_words <- structure(list(word = c("abandoned", "abandonment", "abhor",
"abhorrent", "abolish", "abomination", "abuse", "accursed", "accusation",
"accused", "accuser", "accusing", "actionable", "adder", "adversary",
"adverse", "adversity", "advocacy", "affront", "aftermath", "aggravated",
"aggravating", "aggravation", "aggression", "aggressive", "aggressor",
"agitated", "agitation", "agony", "alcoholism", "alienate", "alienation",
"allegation", "altercation", "ambush", "anarchism", "anarchist",
"anarchy", "anathema", "anger", "angry", "anguish", "animosity",
"animus", "annihilate", "annihilated", "annihilation", "annoy",
"annoyance", "annoying", "antagonism", "antagonist", "antagonistic",
"antichrist", "antipathy", "antisocial", "antithesis", "anxiety",
"argue", "argument", "argumentation", "arguments", "armament",
"armed", "arraignment", "arrogant", "arson", "assail", "assailant",
"assassin", "assassinate", "assassination", "assault", "asshole",
"atrocious", "atrocity", "attack", "attacking", "attorney", "avarice"
), anger = c(TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,
TRUE, TRUE, TRUE, TRUE, TRUE, TRUE)), row.names = c(NA, -80L), class = c("tbl_df",
"tbl", "data.frame"))

Compare the sales of 2015 and 2016 with Tier and Region from the following dataframe in R

Dataframe
Output to be displayed
I have written this code, but I am not getting the desired output shown above:
dataframe%>%
group_by(Region,Tier)%>%
summarise(TotalSales2015=sum(Sales2015),TotalSales2016=sum(Sales2016))%>%
ggplot(aes(x=Tier, y=Sales, fill=Year)) +
geom_bar(stat="identity", position=position_dodge())+
geom_text(aes(label=Sales))
dput(dataframe)
structure(list(AccountId = c(1116L, 1116L, 2391L, 2391L, 2397L,
2400L, 2400L, 2404L, 2406L, 2408L), AccountName = c("Account1",
"Account1", "Account2", "Account2", "Account3", "Account4", "Account4",
"Account5", "Account6", "Account7"), Region = c("West", "West",
"East", "East", "East", "East", "East", "East", "East", "East"
), Division = c("DIAMONDBACK", "DIAMONDBACK", "MINUTEMEN", "MINUTEMEN",
"MINUTEMEN", "MINUTEMEN", "MINUTEMEN", "EMPIRE", "BIG APPLE",
"BIG APPLE"), City = c("PHOENIX W", "PHOENIX W", "HARTFORD",
"HARTFORD", "WORCESTER", "PORTLAND", "PORTLAND", "BRIDGEPORT",
"JERSEY CITY", "JERSEY CITY"), State = c("AZ", "AZ", "CT", "CT",
"MA", "ME", "ME", "CT", "NJ", "NJ"), Tier = c("Low", "Low", "Med",
"Med", "Med", "High", "High", "Low", "Med", "High"), Month = c("Aug",
"Oct", "Jun", "Mar", "Sep", "Jul", "Feb", "Mar", "Mar", "Aug"
), Sales2015 = c(0, 10500.78, 0, 19881, 3684.48, 0, 2631.31,
4153.89, 0, 0), Sales2016 = c(13208.52, 23114.91, 6627, 13254,
0, 10525.24, 42812.62, 3918.77, 6951.86, 10994.54), Units2015 = c(0,
3, 0, 9, 1, 0, 1, 1, 0, 0), Units2016 = c(4, 7, 3, 6, 0, 4, 17,
1, 2, 4), TargetAchevied2015 = c(0.7, 0.84, 1.15, 1.33, 1.02,
1.03, 1.08, 0.79, 1.12, 1.11), TargetAchevied2016 = c(1.53, 1.31,
1.29, 1.17, 1.53, 1.45, 0.99, 1.46, 1.02, 1.54)), row.names = c(NA,
10L), class = "data.frame")
With the tidyverse friends dplyr, tidyr, scales and ggplot2 you could try this:
library(dplyr)
library(tidyr)
library(ggplot2)
df <-
dataframe %>%
pivot_longer(cols = contains("20"), names_pattern = "(\\D*)(\\d{4})", names_to = c("metric", "Year")) %>%
pivot_wider(names_from = metric, values_from = value) %>%
group_by(Region, Tier, Year) %>%
summarise(Sales = sum(Sales))
ggplot(df, aes(x = Tier, y = Sales, fill = Year)) +
geom_col(position = position_dodge(width = 0.9))+
geom_text(aes(label = scales::comma(Sales)),
position = position_dodge(width = 0.9),
angle = 90,
hjust = 0)+
scale_y_continuous(expand = expansion(mult = c(0.06, 0.2)))+
labs(fill = "Total sales")+
facet_wrap(~Region, nrow = 1)
Created on 2021-08-29 by the reprex package (v2.0.0)
data
dataframe <- structure(list(AccountId = c(1116L, 1116L, 2391L, 2391L, 2397L,
2400L, 2400L, 2404L, 2406L, 2408L), AccountName = c("Account1",
"Account1", "Account2", "Account2", "Account3", "Account4", "Account4",
"Account5", "Account6", "Account7"), Region = c("West", "West",
"East", "East", "East", "East", "East", "East", "East", "East"
), Division = c("DIAMONDBACK", "DIAMONDBACK", "MINUTEMEN", "MINUTEMEN",
"MINUTEMEN", "MINUTEMEN", "MINUTEMEN", "EMPIRE", "BIG APPLE",
"BIG APPLE"), City = c("PHOENIX W", "PHOENIX W", "HARTFORD",
"HARTFORD", "WORCESTER", "PORTLAND", "PORTLAND", "BRIDGEPORT",
"JERSEY CITY", "JERSEY CITY"), State = c("AZ", "AZ", "CT", "CT",
"MA", "ME", "ME", "CT", "NJ", "NJ"), Tier = c("Low", "Low", "Med",
"Med", "Med", "High", "High", "Low", "Med", "High"), Month = c("Aug",
"Oct", "Jun", "Mar", "Sep", "Jul", "Feb", "Mar", "Mar", "Aug"
), Sales2015 = c(0, 10500.78, 0, 19881, 3684.48, 0, 2631.31,
4153.89, 0, 0), Sales2016 = c(13208.52, 23114.91, 6627, 13254,
0, 10525.24, 42812.62, 3918.77, 6951.86, 10994.54), Units2015 = c(0,
3, 0, 9, 1, 0, 1, 1, 0, 0), Units2016 = c(4, 7, 3, 6, 0, 4, 17,
1, 2, 4), TargetAchevied2015 = c(0.7, 0.84, 1.15, 1.33, 1.02,
1.03, 1.08, 0.79, 1.12, 1.11), TargetAchevied2016 = c(1.53, 1.31,
1.29, 1.17, 1.53, 1.45, 0.99, 1.46, 1.02, 1.54)), row.names = c(NA,
10L), class = "data.frame")

Filling in the values of a column based on matching strings from the column of another dataset [duplicate]

This question already has answers here:
How to join (merge) data frames (inner, outer, left, right)
(13 answers)
Closed 2 years ago.
I am working with the following two datasets :
will_can
structure(list(will_can.REGION = c("AB", "B", "B", "B", "BB",
"BB", "BD", "BH", "BH", "BH", "BR", "BS", "BS", "BT", "BT", "CF",
"CF", "CM", "CO", "CV", "CV", "CV", "CW", "DA", "DA", "DD", "DE",
"DE", "DG", "DG", "DG", "DG", "DL", "DN", "DT", "E", "E", "E",
"EH", "EH", "EH", "EH", "EH", "EH", "EH", "EX", "EX", "EX", "FK",
"FK", "FY", "G", "G", "G", "GL", "GL", "HA", "HD", "HD", "IV",
"KA", "KA", "KA", "KA", "KA", "KA", "KA", "KA", "KA", "KA", "KA",
"KA", "KA", "KA", "KA", "KA", "KA", "KT", "KY", "KY", "KY", "L",
"L", "L", "LA", "LA", "LE", "LE", "M", "M", "ME", "ME", "MK",
"ML", "N", "N", "N", "NE", "NG", "NN", "NN", "NR", "NW", "OL",
"OX", "OX", "PH", "PO", "PR", "RG", "RH", "RM", "RM", "S", "S",
"S", "S", "SA", "SE", "SE", "SE", "SE", "SE", "SG", "SL", "SN",
"SN", "SO", "SO", "SO", "SS", "ST", "ST", "ST", "ST", "SW", "SW",
"SW", "SW", "SY", "SY", "SY", "TA", "TD", "TD", "TN", "TW", "UB",
"UB", "W", "W", "W", "W", "WA", "WC", "WD")), class = "data.frame", row.names = c(NA,
-156L))
will_can_region_norm
structure(list(norm = c(67.3112073766083, 0, 62.9924341677094,
0, 134.940019161483, 86.0271073135687, 233.710968710152, 0, 0,
136.210220315945, 72.0106074505199, 54.9624828839958, 0, 0, 46.5061888459603,
0, 51.9149234846709, 85.3970454501009, 0, 0, 141.438961332615,
122.50716299382, 197.887432921107, 96.646567080111, 108.996678489718,
873.779493880704, 0, 109.106806944561, 56.7421763178016, 249.99781251914,
0, 106.993398828272, 0, 182.997053590583, 0, 225.716259764203,
217.655353412983, 98.8344746903195, 70.3435951664196, 106.870878390986,
0, 0, 113.255439262354, 226.344150395729, 0, 0, 0, 0, 0, 0, 0,
0, 0, 92.5698187029358, 0, 1159.88543061088, 59.5746039659052,
0, 217.977759293264, 88.627745595238, 155.299651064979, 0, 70.3301130229532,
0, 0, 0, 0, 36.166169734453, 162.12380892704, 74.7710230881704,
112.29824076945, 120.249189991435, 25.6209421071498, 36.7120335621411,
115.238964414265, 0, 50.4621322067494, 59.9490876378327, 82.9160720202368,
132.342362545417, 0, 0, 209.987774511768, 0, 45.0104437732687,
59.5244437425851, 54.7420581590574, 77.921490980977, 132.545922191567,
100.083647410414, 51.5757713324224, 102.602449571922, 98.8984492920948,
0, 129.885834248271, 0, 189.332549749021, 149.846130500895, 0,
0, 73.4653456617979, 220.103517986062, 111.317004279081, 375.711503660056,
156.229153172374, 760.35739839154, 0, 83.1515916711375, 0, 0,
0, 73.5483180088058, 269.518568414391, 102.141462145838, 55.2886923953334,
151.949727736478, 148.297412239816, 0, 0, 0, 0, 0, 0, 0), REGION = c("AB",
"AL", "B", "BA", "BB", "BD", "BH", "BL", "BN", "BR", "BS", "BT",
"CA", "CB", "CF", "CH", "CM", "CO", "CR", "CT", "CV", "CW", "DA",
"DD", "DE", "DG", "DH", "DL", "DN", "DT", "DY", "E", "EC", "EH",
"EN", "EX", "FK", "FY", "G", "GL", "GU", "GY", "HA", "HD", "HG",
"HP", "HR", "HS", "HU", "HX", "IG", "IM", "IP", "IV", "JE", "KA",
"KT", "KW", "KY", "L", "LA", "LD", "LE", "LL", "LN", "LS", "LU",
"M", "ME", "MK", "ML", "N", "NE", "NG", "NN", "NP", "NR", "NW",
"OL", "OX", "PA", "PE", "PH", "PL", "PO", "PR", "RG", "RH", "RM",
"S", "SA", "SE", "SG", "SK", "SL", "SM", "SN", "SO", "SP", "SR",
"SS", "ST", "SW", "SY", "TA", "TD", "TF", "TN", "TQ", "TR", "TS",
"TW", "UB", "W", "WA", "WC", "WD", "WF", "WN", "WR", "WS", "WV",
"YO", "ZE")), row.names = c(NA, -124L), class = "data.frame")
I am trying to add a new column, will_can$norm, containing the values of will_can_region_norm$norm, matched on the variable "REGION", which is the same in both datasets. That way, the values from the second dataset would automatically be filled into the first based on matching REGION strings.
Based on another question in the forum I tried the following function:
will_can2 <- merge(will_can, will_can_region_norm[,"norm"], by = "REGION", all=TRUE)
But I get the following error:
Error in fix.by(by.y, y) :
'by' must specify a unique correct column [translated from French]
Is there something I'm missing here? Would be grateful for some help !
Cameron
For your merge(will_can, will_can_region_norm[,"norm"], by = "REGION", all=TRUE) command to work, both data.frames would need a column called REGION. In your example:
will_can doesn't have this column, but it does have one called will_can.REGION.
You've extracted a single column called norm from will_can_region_norm, and tried to merge based on that single column. Unfortunately, the merge() command never sees the REGION column of will_can_region_norm.
In your case, try something like
merge(will_can, will_can_region_norm, by.x = "will_can.REGION", by.y="REGION", all=TRUE)

ggplot density plot: Different x-axis for each group

I am trying to plot multiple density plots for some data.
I have the following code:
ggplot(data=stack) +
geom_density(aes(x=OfferPrice, group=Country, fill=Country),
alpha=0.5, adjust=2) +
facet_grid(~Country) +
theme_bw()
My problem arises with the x-axis on each of the density plots.
Looking a little closer at the data:
stack %>%
group_by(Country) %>%
summarise(min(OfferPrice),
mean(OfferPrice),
max(OfferPrice))
It looks like Country JN has some very high numbers. Earlier I cut the top and bottom 5% of extreme values for each group, so there shouldn't necessarily be extreme values. What I think is wrong is that the x-axis on every plot is using the maximum from the JN group. How is it possible to have a different x-axis for each group?
Data:
stack <- structure(list(Country = c("US", "GB", "US", "HK", "JN", "US",
"CH", "CA", "US", "US", "CA", "JN", "GB", "AU", "US", "GB", "US",
"GB", "HK", "CH", "CA", "CA", "US", "GB", "TA", "JN", "CA", "CA",
"CA", "CA", "CH", "GB", "CA", "HK", "CA", "US", "US", "CA", "US",
"AU", "CA", "CA", "CA", "US", "GB", "GB", "AU", "US", "US", "AU",
"CA", "CA", "CA", "US", "CA", "GB", "CA", "US", "GB", "US", "AU",
"AU", "US", "CA", "US", "GB", "AU", "CH", "CA", "CA", "GB", "AU",
"AU", "CH", "CA", "AU", "CH", "US", "CH", "TA", "AU", "AU", "GB",
"CH", "HK", "AU", "AU", "CA", "US", "TA", "GB", "US", "AU", "US",
"CA", "CA", "US", "AU", "CA", "US", "CA", "US", "CA", "US", "CA",
"US", "US", "AU", "CA", "AU", "GB", "US", "HK", "AU", "US", "CA",
"JN", "JN", "GB", "JN", "CA", "CA", "AU", "GB", "GB", "US", "US",
"US", "AU", "GB", "CA", "CA", "US", "CH", "GB", "US", "US", "AU",
"GB", "CH", "JN", "CA", "AU", "CA", "US", "US", "AU", "AU", "CA",
"US", "GB", "GB", "US", "US", "CA", "US", "HK", "AU", "US", "GB",
"US", "GB", "GB", "US", "CA", "JN", "CA", "AU", "CA", "CA", "GB",
"CA", "HK", "HK", "US", "CH", "US", "AU", "TA", "US", "CH", "HK",
"AU", "US", "HK", "GB", "AU", "CH", "US", "AU", "US", "CH", "US",
"CH", "CA", "AU", "HK", "CA", "US", "CH", "GB", "CA", "CA", "CA",
"CA", "US", "CA", "CA", "US", "HK", "US", "HK", "AU", "GB", "AU",
"CH", "US", "AU", "CA", "CA", "US", "GB", "AU", "US", "CH", "CA",
"CA", "CA", "US", "AU", "GB", "GB", "CA", "AU", "CA", "AU", "US",
"HK", "AU", "US", "AU", "CA", "US", "US", "US", "CA", "GB", "CA",
"US", "CA", "US", "AU", "CA", "US", "AU", "CH", "GB", "CA", "CA",
"CA", "CA", "HK", "AU", "TA", "AU", "GB", "AU", "CA", "JN", "US",
"CA", "CA", "AU", "US", "US", "GB", "CA", "US", "GB", "US", "CA",
"CA", "CH", "US", "US", "US", "US", "US", "HK", "CH", "CA", "CA",
"CA", "AU", "GB", "CH", "CA", "GB", "CA", "AU"), EffectiveDate = structure(c(17617,
17500, 17556, 17596, 17618, 17667, 17786, 17728, 17569, 17760,
17585, 17613, 17806, 17847, 17786, 17665, 17702, 17683, 17574,
17725, 17723, 17658, 17563, 17847, 17584, 17578, 17842, 17522,
17697, 17521, 17680, 17794, 17813, 17697, 17773, 17578, 17556,
17595, 17669, 17501, 17655, 17562, 17668, 17589, 17582, 17780,
17812, 17667, 17611, 17758, 17751, 17617, 17505, 17505, 17725,
17506, 17513, 17541, 17644, 17702, 17828, 17688, 17696, 17519,
17850, 17746, 17779, 17547, 17806, 17701, 17638, 17759, 17786,
17770, 17835, 17750, 17841, 17653, 17788, 17553, 17794, 17654,
17724, 17675, 17802, 17638, 17625, 17514, 17709, 17758, 17681,
17507, 17708, 17758, 17800, 17569, 17821, 17493, 17834, 17848,
17639, 17549, 17590, 17575, 17630, 17639, 17760, 17724, 17701,
17725, 17756, 17794, 17511, 17494, 17822, 17758, 17709, 17492,
17605, 17709, 17602, 17644, 17814, 17696, 17760, 17603, 17730,
17675, 17590, 17724, 17743, 17680, 17690, 17711, 17560, 17570,
17702, 17512, 17626, 17618, 17576, 17731, 17527, 17709, 17729,
17672, 17515, 17527, 17641, 17597, 17812, 17540, 17602, 17743,
17662, 17709, 17505, 17737, 17603, 17515, 17568, 17843, 17738,
17508, 17816, 17578, 17590, 17812, 17743, 17528, 17616, 17687,
17646, 17515, 17816, 17557, 17507, 17564, 17777, 17802, 17511,
17842, 17584, 17556, 17547, 17844, 17590, 17674, 17759, 17583,
17836, 17721, 17724, 17801, 17578, 17808, 17682, 17849, 17708,
17515, 17746, 17633, 17759, 17591, 17802, 17540, 17560, 17588,
17800, 17787, 17821, 17724, 17645, 17527, 17722, 17556, 17704,
17844, 17619, 17792, 17577, 17637, 17843, 17765, 17688, 17562,
17834, 17738, 17653, 17645, 17718, 17676, 17637, 17570, 17490,
17534, 17646, 17625, 17766, 17808, 17675, 17786, 17808, 17555,
17739, 17802, 17617, 17619, 17667, 17634, 17662, 17711, 17806,
17513, 17627, 17673, 17574, 17647, 17609, 17619, 17521, 17543,
17686, 17807, 17613, 17543, 17543, 17528, 17694, 17576, 17584,
17521, 17605, 17618, 17723, 17641, 17683, 17823, 17634, 17844,
17836, 17816, 17539, 17583, 17618, 17687, 17589, 17602, 17717,
17535, 17718, 17625, 17822, 17651, 17521, 17751, 17617, 17563,
17578, 17772), class = "Date"), OfferPrice = c(44, 13, 33, 0.3,
3000, 23, 6.26, 0.35, 10, 6.25, 0.25, 7110, 109.5, 0.11, 16,
2, 5.5, 15, 0.5, 8.5, 0.5, 0.2, 5, 92.5, 22, 103740, 0.23, 0.75,
8.65, 1.23, 17.4, 1.5, 0.38, 0.84, 8.1, 27.5, 10, 1, 14, 0.42,
0.1, 1.82, 2, 39.8, 238, 340, 0.3, 4.5, 41.5, 0.2, 0.25, 0.27,
0.35, 5, 0.3, 115, 0.15, 5, 142.25, 14, 2.43, 0.02, 24, 0.115,
8.25, 25, 0.155, 9.6, 0.67, 6, 52.5, 0.2, 0.2, 1.37, 1.6, 0.65,
2.9, 4, 7, 72, 0.025, 0.14, 22.5, 6.75, 0.64, 0.8, 0.8, 0.4,
22, 94.8, 15, 10, 2.45, 34, 1, 9.3, 6.25, 0.018, 0.2, 24.5, 0.3,
2.9, 0.35, 2.05, 0.4, 29.5, 2.26, 0.36, 0.75, 0.027, 2.8, 16,
3.54, 0.018, 10, 0.15, 1780, 1602, 120, 3900, 0.25, 0.18, 2.32,
269, 175, 18, 18, 23, 0.2, 10, 0.1, 0.6, 4.8, 6, 164.5, 7, 26.42,
0.02, 190, 11, 992, 0.2, 1.42, 0.55, 23, 33.5, 0.35, 0.065, 1.16,
29.5, 65, 8, 27.5, 18.9, 0.15, 17, 0.63, 0.34, 26.25, 0.65, 6.9,
10, 6.75, 1.21, 0.95, 73125, 2.5961, 0.054, 1.2, 9.64, 251, 2.46,
0.18, 0.375, 9.97, 20.43, 25.5, 0.025, 60, 3, 1.55, 0.5, 0.2,
17, 0.443, 8, 0.05, 5.25, 1.15, 0.45, 7.155, 17, 24.5, 12.5,
2.1, 0.75, 0.35, 0.39, 38.2, 0.63, 16, 0.15, 0.1, 0.12, 2.32,
10, 0.3, 1.66, 17, 0.4, 1.3, 0.3, 1.08, 30, 0.8, 10.88, 0.9,
0.21, 0.17, 1.7, 2.25, 1, 0.08, 1.5, 14.75, 0.35, 0.44, 0.35,
17, 2, 37, 195, 0.165, 0.02, 0.2, 0.015, 25, 1.09, 0.45, 10,
0.145, 0.92, 36, 13.25, 4, 0.6, 101, 0.7, 15, 0.3, 0.7, 0.06,
0.25, 6.5, 1.1, 16.72, 1.25, 0.1, 0.12, 0.28, 1.18, 0.4, 0.02,
75, 0.08, 5, 0.02, 0.5, 2878, 8, 0.15, 0.33, 0.1, 21.25, 1, 12,
5.83, 4.25, 1.65, 3.15, 3, 0.15, 7.4, 31.25, 12, 24, 19.75, 41.5,
0.88, 13.65, 0.25, 0.15, 0.25, 2.35, 101, 1.26, 1.65, 10, 1.32,
0.5)), row.names = c(NA, -300L), class = c("tbl_df", "tbl", "data.frame"
))
You can specify whether you want free scales (x, y or both) in facet_grid.
# Density plot of OfferPrice, filled and grouped by Country, with one facet
# per Country. NOTE(review): `stack` is presumably the data frame holding the
# Country and OfferPrice columns -- it is not defined in this snippet.
ggplot(data=stack) +
geom_density(aes(x=OfferPrice, group=Country, fill=Country),
alpha=0.5, adjust=2) +
# scales = "free_x" frees the x scale across facets (y stays shared).
facet_grid(~Country, scales = "free_x") +
theme_bw()

R - d3heatmap - implement breaks

I am trying to plot a heatmap using the d3heatmap package.
Unfortunately, I have not been successful yet in implementing certain breaks using the option breaks=... as in heatmap or heatmap.2.
This just yields odd results; I am not even sure whether I am doing something wrong or whether the function simply ignores breaks.
For example, I tried:
breaks = c(seq(-10, -2), seq(-2, -1.65), seq(-1.65, 1.65), seq(1.65, 2), seq(2, 10))
and
breaks = c(-10, -2, -1.65, 1.65, 2, 10)
with
colors = c("red", "yellow", "green", "yellow", "red")
but nothing seems to work properly.
Any suggestions?
Here's the dput of my data:
> dput(mat)
structure(c(-0.04, NA, 0.59, NA, 0.675, 0.96, 1.09, 0.445, NA,
0.545, NA, NA, 0.09, -1.11, NA, 0.99, 0.13, 0.215, 1.425, 0,
NA, 0.69, 0.805, NA, 0.69, 1.22, NA, 0.3, NA, 0.025, NA, 0.075,
0.36, -0.94, NA, -0.31, 0.26, 1.02, -1.19, NA, NA, -0.77, NA,
-1.48, 1.05, 0.48, NA, NA, NA, 1.49, -1.285, NA, 0.76, 1.14,
-0.62, NA, NA, NA, 0.95, NA, NA, -0.12, 0.49, NA, 2.31, NA, -0.33,
0.85, NA, -1.7, -1.63, NA, -1.12, 0.135, -0.18, NA, -0.245, NA,
-0.2, -0.2, 0.23, -0.11, NA, 0.3, -0.81, 0.04, 0.18, -0.7, 0.53,
0.44, -0.49, 0.28, 0.26, 0.06, 0.265, 0.21, 0.06, -0.175, 0.365,
0.255, 1.25, -0.35, 0.16, 0.125, 0.825, 0.08, 0.02, -0.02, 0.99,
0.79, -0.23, 0.06, NA, 0.36, -0.64, -0.195, 1.19, -0.29, 0.915,
NA, NA, NA, NA, 0.2, 0.1, NA, 0.04, 0.33, NA, 1.46, 2.36, NA,
-0.92, 1.295, NA, NA, 0.8, NA, 1.09, 1.45, 5.42, NA, NA, NA,
1.69, 3.43, NA, 0.55), .Dim = c(37L, 4L), .Dimnames = list(c("AT",
"BE", "BG", "CEE", "CH", "CN", "CZ", "DE", "DK", "EA", "EE",
"EMU", "ES", "EU", "FI", "FR", "GB", "GR", "HR", "HU", "IE",
"IT", "JP", "LU", "NL", "PL", "PT", "RO", "RS", "RU", "SE", "SI",
"SK", "TR", "UA", "UK", "US"), c("Credit Risk", "Funding and liquidity Risk",
"Macro Risk", "Market Risk")))
And the code I am running:
d3heatmap(abs(mat),
dendrogram = "none",
breaks = c(0,1.65,2,10),
col = c("green", "yellow", "red"),
na.rm = TRUE)
The same function using heatmap.2 works perfectly, though.
The function d3heatmap simply does not have a 'breaks' argument. If it gets passed in as an argument it is silently ignored. (See ?d3heatmap.)
The heatmap.2 function in the gplots package on the other hand does have a "breaks" argument. That explains the difference in behaviour.
Luckily, it is still possible to get the desired behaviour by passing an appropriate 'colors' function to d3heatmap. It works as follows.
First the example data:
mat <- structure(c(-0.04, NA, 0.59, NA, 0.675, 0.96, 1.09, 0.445, NA,
0.545, NA, NA, 0.09, -1.11, NA, 0.99, 0.13, 0.215, 1.425, 0,
NA, 0.69, 0.805, NA, 0.69, 1.22, NA, 0.3, NA, 0.025, NA, 0.075,
0.36, -0.94, NA, -0.31, 0.26, 1.02, -1.19, NA, NA, -0.77, NA,
-1.48, 1.05, 0.48, NA, NA, NA, 1.49, -1.285, NA, 0.76, 1.14,
-0.62, NA, NA, NA, 0.95, NA, NA, -0.12, 0.49, NA, 2.31, NA, -0.33,
0.85, NA, -1.7, -1.63, NA, -1.12, 0.135, -0.18, NA, -0.245, NA,
-0.2, -0.2, 0.23, -0.11, NA, 0.3, -0.81, 0.04, 0.18, -0.7, 0.53,
0.44, -0.49, 0.28, 0.26, 0.06, 0.265, 0.21, 0.06, -0.175, 0.365,
0.255, 1.25, -0.35, 0.16, 0.125, 0.825, 0.08, 0.02, -0.02, 0.99,
0.79, -0.23, 0.06, NA, 0.36, -0.64, -0.195, 1.19, -0.29, 0.915,
NA, NA, NA, NA, 0.2, 0.1, NA, 0.04, 0.33, NA, 1.46, 2.36, NA,
-0.92, 1.295, NA, NA, 0.8, NA, 1.09, 1.45, 5.42, NA, NA, NA,
1.69, 3.43, NA, 0.55), .Dim = c(37L, 4L),
.Dimnames = list(c("AT", "BE", "BG", "CEE", "CH", "CN", "CZ", "DE", "DK", "EA", "EE", "EMU", "ES", "EU", "FI", "FR", "GB", "GR", "HR", "HU", "IE", "IT", "JP", "LU", "NL", "PL", "PT", "RO", "RS", "RU", "SE", "SI", "SK", "TR", "UA", "UK", "US"), c("Credit Risk", "Funding and liquidity Risk", "Macro Risk", "Market Risk")))
Suppose we want the following three color bins: blue for values < 0, green for values >= 0 but < 2, and red for values >= 2. We then define the corresponding ordered vector of colors.
palette <- c("blue", "green", "red")
We also define the boundary values of the color bins. These values must include the domain boundaries.
mi <- min(mat, na.rm = TRUE)
ma <- max(mat, na.rm = TRUE)
breaks <- c(mi, 0, 2, ma)
We can now define a color interpolation function which maps a value in [0,1] onto a color, respecting our color bins. The 'scales' package helps here.
# Install 'scales' once if it is not already available, then load it.
install.packages('scales') # if needed
library(scales)
# Build a binned color-mapping function; the bin edges are the data breaks
# rescaled to [0,1] with scales::rescale().
colorFunc <- col_bin(palette, bins = rescale(breaks))
The breaks originally defined in the domain of our data needed to be rescaled to [0,1]. The 'rescale' function in the 'scales' package handled that.
Small detail: the low boundary of a bin is included in the bin, but the high boundary is excluded. So the value 0 will be green, anything between 0 and 2 will be green too, but 2 will be red.
We can now plot the heat map.
d3heatmap(mat, dendrogram = "none", colors = colorFunc, na.rm = TRUE)
The result looks like this:

Resources