Related
I am making a plot to show the relationship between house size and prices. The thing is, I need the 5% of the most recently built houses to have a different color and symbol on the plot.
Here is my code (new_baltimore is the dataframe):
# Ages of all houses; the lower 5% quantile is used to pick the most recently
# built ones.
y <- new_baltimore$AGE
quantile(y, 0.05) # the result is 4
# Rows whose AGE is at or below that quantile (hard-coded as 4 from the line
# above).
k <- subset(new_baltimore, y<=4)
# Sizes of those houses, used below to find the new houses again by SQFT.
kk <- k$SQFT
# NOTE: `any new_baltimore$SQFT` is not valid R -- `any` is followed by an
# expression without parentheses -- which is what raises the
# "unexpected symbol" errors quoted below.
col = ifelse(any new_baltimore$SQFT %in% kk, "red", "green")
pch = ifelse(any new_baltimore$SQFT %in% kk, 25, 20)
# NOTE(review): PRICE is referenced as a bare variable, not as a column of
# new_baltimore -- confirm it exists in the workspace.
plot(new_baltimore$SQFT, PRICE, col=col, pch=pch)
R gives me the errors
Error: unexpected symbol in "col = ifelse(any new_baltimore"
Error: unexpected symbol in "pch = ifelse(any new_baltimore"
Any help?
edit: This is the reproducible data:
baltimore_struct <-
structure(
list(
new_baltimore.SQFT = c(
11.25,
28.92,
30.62,
26.12,
22.04,
39.42,
21.88,
25.6,
44.12,
19.88,
12.08,
10.99,
12.8,
29.79,
14.3,
13.72,
11.84,
18.06,
10.72,
8.96,
14.38,
36.75,
20,
22.82,
24.86,
19.2,
11.58,
26,
14.4,
11.62,
23.08,
23.76,
15.6,
10,
22.8,
16.76,
22.1,
14.28,
15.36,
16,
23.04,
24.94,
11.82,
12.88,
11.2,
18.12,
38.25,
17.68,
19.02,
32.8,
15.16,
21.975,
12.6,
23.52,
17.52,
47.61,
20.55,
35.52,
8.4,
13.68,
14.48,
12.8,
12.8,
18,
15.4,
10.08,
8.96,
8.96,
20,
12.88,
12,
18.16,
14.28,
26,
12.02,
20.8,
11.78,
8.68,
17.6,
11.4,
44.55,
46.32,
10.24,
9.6,
31.2,
26.4,
13.6,
27.48,
17.86,
18.04,
14.84,
10.46,
14.56,
6.96,
9.5,
11.86,
12.88,
12.32,
6.72,
10.08,
15.6,
6.72,
11.52,
11.76,
10.24,
11.52,
9.28,
6.72,
15.6,
15.5,
9.84,
15.6,
13.76,
10.24,
5.76,
10.08,
11.52,
12.15,
9.77,
15,
14.4,
14.5,
22.54,
10.24,
7.8,
8.4,
10.92,
42.9,
9,
10.5,
10.08,
12.6,
8.96,
8.58,
7.56,
10.8,
13.44,
10.24,
14.44,
12.24,
13.2,
9.6,
15.22,
24.16,
10.24,
10.24,
9.88,
23.2,
17.68,
24.3,
35.94,
21.6,
11.02,
21,
23.92,
14.4,
28,
11.44,
21.94,
10.24,
16.86,
9.92,
13.44,
12,
14.76,
8.96,
11.52,
8.64,
8.12,
11.12,
11.28,
10.36,
11.52,
17.1,
17.52,
10.73,
11.2,
12.8,
12,
41.07,
12.8,
22.36,
10.56,
13.44,
11.02,
17.98,
18.88,
11.76,
9.36,
11.52,
27.3,
23.04,
17.68,
13.36,
11.6,
11.52,
9.98,
12.96,
11.13,
19.6,
11.52,
12.16,
0,
10.64
),
PRICE2 = c(
47,
113,
165,
104.3,
62.5,
70,
127.5,
64.5,
145,
63.5,
58.9,
65,
48,
3.5,
12.8,
17.5,
36,
41.9,
53.5,
24.5,
24.5,
55.5,
60,
51,
46,
46,
44,
54.9,
42.5,
44,
44.9,
37.9,
33,
43.9,
49.6,
52,
37.5,
50,
35.9,
42.9,
107,
112,
44.9,
55,
102,
35.5,
62.9,
39,
110,
8,
62,
85.9,
57,
110,
67.7,
89.5,
70,
74,
13,
48,
24,
53.5,
34.5,
53,
87.5,
33.5,
24,
9.6,
30,
41,
30,
38.9,
20.7,
49.9,
18.6,
39,
34,
16,
18.9,
15.2,
41.5,
53,
22,
24.9,
6.7,
32.5,
30,
59,
29.5,
26,
16.5,
39,
48.9,
33.5,
46,
54,
57.9,
37.9,
32,
31,
34,
29,
32.5,
51.9,
31,
41.8,
48,
28,
35,
46.5,
51.9,
35.4,
16,
35,
35,
36.5,
35.9,
45,
40,
35,
38,
37,
23,
25.5,
39.5,
21.5,
9,
67.5,
13.4,
12.5,
28.5,
23,
33.5,
9,
11,
30.9,
31.65,
33,
33.4,
47,
40,
46,
45.5,
57,
29.9,
30,
34,
51,
64.5,
57.5,
85.5,
61,
38,
56.5,
60.4,
51.5,
54,
69,
56,
27.9,
37.5,
32.9,
22,
29.9,
39.9,
32.6,
38.5,
21.5,
25.9,
27.5,
22.9,
31.5,
8.5,
5.5,
33,
57,
47,
43.5,
43.9,
68.5,
44.25,
61,
40,
44.5,
57,
35,
35.1,
64.5,
40,
42.6,
50,
58,
58,
55,
43,
54,
39,
45,
42,
38.9,
43.215,
26.5,
30,
29.5
)
),
row.names = c(
1L,
2L,
3L,
4L,
5L,
6L,
7L,
8L,
9L,
10L,
11L,
12L,
13L,
14L,
15L,
16L,
17L,
18L,
19L,
20L,
21L,
22L,
23L,
24L,
25L,
26L,
27L,
28L,
29L,
30L,
31L,
32L,
33L,
34L,
35L,
36L,
37L,
38L,
39L,
40L,
41L,
42L,
43L,
44L,
45L,
46L,
47L,
48L,
49L,
50L,
51L,
53L,
54L,
55L,
56L,
57L,
58L,
59L,
60L,
61L,
62L,
63L,
64L,
65L,
66L,
67L,
68L,
69L,
70L,
71L,
72L,
73L,
74L,
75L,
76L,
77L,
78L,
79L,
80L,
81L,
82L,
83L,
84L,
85L,
86L,
87L,
88L,
89L,
90L,
91L,
92L,
93L,
94L,
95L,
96L,
97L,
98L,
99L,
100L,
101L,
102L,
103L,
104L,
105L,
106L,
107L,
108L,
109L,
110L,
111L,
112L,
113L,
114L,
115L,
116L,
117L,
118L,
119L,
120L,
121L,
122L,
123L,
124L,
125L,
126L,
127L,
128L,
129L,
130L,
131L,
132L,
133L,
134L,
135L,
136L,
137L,
138L,
139L,
140L,
141L,
142L,
143L,
144L,
145L,
146L,
147L,
148L,
149L,
150L,
151L,
152L,
153L,
154L,
155L,
156L,
157L,
158L,
159L,
160L,
161L,
162L,
163L,
164L,
165L,
166L,
167L,
168L,
169L,
170L,
171L,
172L,
173L,
174L,
175L,
176L,
177L,
178L,
179L,
180L,
181L,
182L,
183L,
184L,
185L,
186L,
187L,
188L,
189L,
190L,
191L,
192L,
193L,
194L,
195L,
196L,
197L,
198L,
199L,
200L,
201L,
202L,
203L,
204L,
205L
),
class = "data.frame"
)
edit2: I found the error, I just had to remove the any in the ifelse commands. So the correct code looks like this
# Removing the bare `any` fixes the syntax error. The "new house" flag is also
# taken straight from AGE instead of matching SQFT values against kk: matching
# on size would mark every older house that happens to share its size with a
# new one.
is_new <- new_baltimore$AGE <= quantile(new_baltimore$AGE, 0.05)
col <- ifelse(is_new, "red", "green")
pch <- ifelse(is_new, 25, 20)
Next time, please try to provide some reproducible data. It makes it easier for others to help you. You can find more information here.
I tried to generate some data on my own. Hope it fits your data. I recommend using the tidyverse as it contains a lot of useful packages for manipulating and visualising data. library(tidyverse) loads all packages from the tidyverse but you can also load only the necessary packages such as dplyr for data manipulation or ggplot2 for data visualisation. See comments for further details:
#install.packages("tidyverse")
library(tidyverse)
# Simulated example data: 100 houses with random age, price and size.
# Fixing the seed makes the example reproducible from run to run.
set.seed(42)
n_houses <- 100
data <- data.frame(
  age = sample(10:50, n_houses, replace = TRUE),
  price = sample(50000:1000000, n_houses, replace = TRUE),
  size = sample(200:500, n_houses, replace = TRUE)
)

# Age at or below which a house counts as one of the newest 5%.
threshold <- quantile(data$age, 0.05)

# mutate() adds a grouping column. `<=` (rather than `<`) keeps houses whose
# age equals the threshold, so the "newer" group can never be empty.
plot_data <- data %>%
  mutate(groups = factor(ifelse(age <= threshold, "newer", "older")))

# Map colour and shape to the group. The manual scales use named values so the
# assignment does not depend on the order of the factor levels.
ggplot(plot_data, aes(x = size, y = price, color = groups, shape = groups)) +
  geom_point(size = 4) +
  scale_color_manual(values = c(newer = "aquamarine4", older = "burlywood")) +
  scale_shape_manual(values = c(newer = 18, older = 19)) +
  theme_classic() # one of ggplot2's predefined themes
Hope that helps.
I tried to cluster my dataset using k-means, but column 9 contains categorical data, so when I ran the clustering I got an error like this:
# Run NbClust on columns 2 to 9 of `mi`. Per the question, column 9 is
# categorical, which leads to the error shown below.
# NOTE(review): min.nc/max.nc presumably bound the number of clusters tried
# (2 to 15) -- confirm against the NbClust documentation.
res<-NbClust(mi[2:9],min.nc=2,max.nc=15,method="ward.D2")
Error in t(jeu) %*% jeu :
requires numeric/complex matrix/vector arguments
So I could only run k-means on columns 2 to 8. Is there another way of clustering the data that would let me include column 9 as well?
Data:
df <- structure(list(Name = structure(c(58L, 188L, 40L, 155L, 32L, 88L, 92L, 55L, 135L, 31L, 139L, 26L, 126L, 10L, 166L, 104L, 75L, 180L, 35L, 175L, 77L, 99L, 4L, 71L, 141L, 176L, 53L, 39L, 172L, 196L, 123L, 107L, 16L, 96L, 82L, 185L, 30L, 15L, 94L, 129L, 187L, 151L, 33L, 23L, 28L, 44L, 157L, 69L, 132L, 83L, 131L, 11L, 182L, 181L, 54L, 115L, 116L, 183L, 150L, 195L, 45L, 144L, 1L, 110L, 17L, 114L, 9L, 117L, 112L, 70L, 34L, 169L, 27L, 66L, 3L, 73L, 133L, 91L, 154L, 130L, 160L, 105L, 90L, 165L, 67L, 100L, 162L, 98L, 29L, 68L, 189L, 192L, 102L, 190L, 134L, 136L, 52L, 12L, 81L, 59L, 63L, 122L, 93L, 109L, 178L, 138L, 5L, 43L, 140L, 95L, 2L, 174L, 76L, 51L, 156L, 60L, 149L, 128L, 177L, 142L, 103L, 7L, 8L, 14L, 164L, 74L, 145L, 148L, 113L, 86L, 108L, 48L, 163L, 6L, 186L, 89L, 36L, 191L, 125L, 120L, 62L, 65L, 124L, 168L, 147L, 79L, 173L, 84L, 193L, 25L, 146L, 121L, 127L, 153L, 13L, 106L, 119L, 161L, 49L, 97L, 101L, 61L, 137L, 24L, 85L, 194L, 78L, 41L, 170L, 47L, 118L, 184L, 179L, 72L, 42L, 111L, 87L, 57L, 38L, 37L, 171L, 22L, 50L, 80L, 159L, 18L, 152L, 64L, 56L, 158L, 167L, 46L, 19L, 21L, 20L, 143L), .Label = c("#Mashtag 2013", "#Mashtag 2014", "#Mashtag 2015", "10 Heads High", "5am Saint", "77 Lager", "AB:02", "AB:03", "AB:04", "AB:06", "AB:08", "AB:10", "AB:11", "AB:13", "AB:15", "AB:17", "AB:18", "AB:20", "Ace Of Chinook", "Ace Of Citra", "Ace Of Equinox", "Ace Of Simcoe", "Albino Squid Assasin", "Alice Porter", "All Day Long - Prototype Challenge", "Alpha Dog", "Alpha Pop", "Amarillo - IPA Is Dead", "American Ale", "Anarchist Alchemist", "Arcade Nation", "Avery Brown Dredge", "Baby Dogma", "Baby Saison - B-Sides", "Bad Pixie", "Barley Wine - Russian Doll", "Barrel Aged Albino Squid Assassin", "Barrel Aged Hinterland", "Berliner Weisse With Raspberries And Rhubarb - B-Sides", "Berliner Weisse With Yuzu - B-Sides", "Bitch Please (w/ 3 Floyds)", "Black Dog", "Black Eye Joe (w/ Stone Brewing Co)", "Black Eyed King Imp", "Black Eyed King Imp - Vietnamese Coffee Edition", 
"Black Hammer", "Black Jacques", "Black Tokyo Horizon (w/Nøgne Ã\230 & Mikkeller)", "Blitz Berliner Weisse", "Blitz Series", "Born To Die", "Bounty Hunter - Shareholder Brew", "Bourbon Baby", "Bracken's Porter", "Bramling X", "Brewdog Vs Beavertown", "Brixton Porter", "Buzz", "Candy Kaiser", "Cap Dog (w/ Cap Brewery)", "Catherine's Pony (w/ Beavertown)", "Challenger", "Chaos Theory", "Chili Hammer", "Chinook - IPA Is Dead", "Citra", "Clown King", "Cocoa Psycho", "Coffee Imperial Stout", "Comet", "Dana - IPA Is Dead", "Dead Metaphor", "Dead Pony Club", "Deaf Mermaid - B-Sides", "Devine Rebel (w/ Mikkeller)", "Dog A", "Dog B", "Dog C", "Dog D", "Dog E", "Dog Fight (w/ Flying Dog)", "Dog Wired (w/8 Wired)", "Dogma", "Doodlebug", "Double IPA - Russian Doll", "Edge", "El Dorado - IPA Is Dead", "Electric India", "Ella - IPA Is Dead", "Elvis Juice V2.0 - Prototype Challenge", "Everday Anarchy", "Fake Lager", "Galaxy", "Goldings - IPA Is Dead", "Growler", "Hardcore IPA", "Hardkogt IPA", "HBC 366 - IPA Is Dead", "HBC 369", "Hello My Name Is Beastie", "Hello My Name Is Holy Moose", "Hello My Name Is Ingrid", "Hello My Name Is Little Ingrid", "Hello My Name Is Mette-Marit", "Hello My Name Is PaÌ\210ivi", "Hello My Name is Sonja (w/ Evil Twin)", "Hello My Name is Vladimir", "Hello My Name Is ZeÌ\201 (w/ 2Cabeças)", "Hinterland", "Hobo Pop", "Hop Fiction - Prototype Challenge", "Hopped-Up Brown Ale - Prototype Challenge", "Hoppy Christmas", "Hops Kill Nazis", "Hunter Foundation Pale Ale", "Hype", "India Session Lager - Prototype Challenge", "International Arms Race (w/ Flying Dog)", "Interstellar", "Jack Hammer", "Jasmine IPA", "Jet Black Heart", "Kohatu - IPA Is Dead", "Konnichiwa Kitsune", "Libertine Black Ale", "Libertine Porter", "Lichtenstein Pale Ale", "Lizard Bride - Prototype Challenge", "Lost Dog (w/Lost Abbey)", "Lumberjack Stout", "Magic Stone Dog (w/Magic Rock & Stone Brewing Co.)", "Mandarina Bavaria - IPA Is Dead", "Mango Gose - B-Sides", "Melon And Cucumber IPA - 
B-Sides", "Misspent Youth", "Morag's Mojito - B-Sides", "Moshi Moshi 15", "Motueka", "Movember", "Mr.Miyagi's Wasabi Stout", "Nanny State", "Nelson Sauvin", "Neon Overlord", "Never Mind The Anabolics", "No Label", "Nuns With Guns", "Old World India Pale Ale", "Old World Russian Imperial Stout", "Orange Blossom - B-Sides", "Pale - Russian Doll", "Paradox Islay", "Paradox Islay 2.0", "Paradox Jura", "Peroxide Punk", "Pilsen Lager", "Pioneer - IPA Is Dead", "Prototype 27", "Prototype Helles", "Prototype Pils 2.0", "Pumpkin King", "Punk IPA 2007 - 2010", "Punk IPA 2010 - Current", "Restorative Beverage For Invalids And Convalescents", "Rhubarb Saison - B-Sides", "Riptide", "Russian Doll â\200“ India Pale Ale", "Rye Hammer", "San Diego Scotch Ale (w/Ballast Point)", "Santa Paws", "Shareholder Black IPA 2011", "Ship Wreck", "Shipwrecker Circus (w/ Oskar Blues)", "Simcoe", "Sink The Bismarck!", "Skull Candy", "Sorachi Ace", "Sorachi Bitter - B-Sides", "Spiced Cherry Sour - B-Sides", "Stereo Wolf Stout - Prototype Challenge", "Storm", "Sub Hop", "Sunk Punk", "Sunmaid Stout", "Sunshine On Rye - B-Sides", "The Physics", "This. Is. 
Lager", "TM10", "Trashy Blonde", "Truffle and Chocolate Stout - B-Sides", "U-Boat (w/ Victory Brewing)", "Vagabond Pale ALe - Prototype Challenge", "Vagabond Pilsner", "Vic Secret", "Waimea - IPA Is Dead", "Whisky Sour - B-Sides", "Zephyr"), class = "factor"), ABV = c(4.5, 4.1, 4.2, 6.3, 7.2, NA, 4.7, 7.5, 7.3, 5.3, 4.5, 4.5, 6.1, 11.2, 6, 8.2, 12.5, 8, 4.7, 3.5, 15, 6.7, 7.8, 6.7, 0.5, 7.5, 5.8, 3.6, 10.5, 12.5, 7.2, 8.2, 10.7, 9.2, 7.1, 5, 16.5, 12.8, 6.7, 10, NA, 10, 4.5, 7.4, 7.2, 9.5, 9.2, 9, 7.2, 7.5, NA, 10.43, 7.1, 8, 5, 5.4, 4.1, 10.2, 4, 7, 12.7, 6.5, 7.5, 4.2, 11.8, 7.6, 15, 4.4, 6.3, 7.2, NA, 4.5, 4.5, 7.5, 10, 3.8, 6.4, NA, 4, 15.2, 5.4, 8.3, 6.5, 8, 12, 8.2, 5.6, 7.2, 6.3, 10, 5.6, 4.5, 8.2, 8.4, 6, 6.7, 6.5, 11.5, 8.5, 5.2, 7.1, 4.7, 6.7, 9, 6.5, 6.7, 5, 5.8, 7.5, 4.5, 9, 41, 15, 8.5, 7.2, 9, 3.8, 5.7, 6.3, 7.5, 4.4, 18, 10.5, 11.3, NA, 5.2, 4.5, 9.5, 7.2, 2.7, 6.4, 17.2, 8.5, 4.9, 4.7, 7.2, 10, 4.5, 7.2, 7.2, 6.7, 7.2, 4.4, 9, 7.5, 16.1, 6.7, 2.5, 7.4, 2.8, 4.2, 5.8, 5.2, 10, 12.8, 8.3, 6.5, 6, 3, 7.6, 5.5, 8.8, 5.2, 5.2, 8, 6.7, 15, 11.5, 7.1, NA, 7.5, 7.2, 5.2, 6.8, 5.5, 5.2, 6.7, 5, 9, 9.2, 13.8, 4.5, 3.2, 16.1, 4.7, 14.2, 13, 7.2, 9.2, 4.9, 7.2, 7.2, 4.5, 4.5, 4.5, 7.6), IBU = c(60, 41.5, 8, 55, 59, 38, 40, 75, 30, 60, 50, 42, 45, 150, 70, 70, 100, 60, 45, 33, 90, 67, 70, 70, 55, 75, 35, 8, 85, 125, 70, 70, 100, 125, 65, 47, 20.5, 50, 70, 35, 20, 55, 35, 65, 70, 85, 149, 65, 100, 30, 30, 65, 68, 35, 50, 35, 65, 50, 35, 20, 85, 35, 50, 50, 80, 70, 80, 35, 85, 70, 9, 35, 30, 70, 85, 35, 40, 45, 40, 20, 20, 70, 60, 45, 85, 42, 40, 70, 55, 85, 30, 55, 42, 50, 50, 40, 35, 80, 65, 45, 90, 45, 67, 85, 20, 67, 30, 40, 90, 38, 50, 1085, 90, 85, 100, 80, 20, 35, 130, 75, 35, 70, 14, 50, 25, 65, 25, 80, 70, 36, 50, 75, 100, 30, 37, 100, 80, 55, 50, 250, 67, 100, 70, 70, 80, 85, 70, 35, 70, 30, 25, 40, 50, 55, 70, 70, 55, 60, 8, 175, 35, 40, 45, 55, 85, 70, 90, 50, 80, 45, 0, 130, 55, 30, 60, 40, 70, 50, 85, 65, 60, 40, 8, 100, 25, 20, 100, 250, 50, 18, 
250, 250, 40, 40, 40, 70), OG = c(1044, 1041.7, 1040, 1060, 1069, 1045, 1046, 1068, 1079, 1052, 1047, 1046, 1067, 1098, 1058, 1076, 1093, 1082, 1047, 1038, 1120, 1013, 1074, 1066, 1007, 1068, 1049, 1040, 1102, 1087, 1067, 1076, 1105, 1085, 1065, 1048.5, 1112, 1096, 1066, 1080, 1048, 1090, 1048, 1069, 1067, 1095, 1083, 1080, 1064, 1080, 1043, 1095, 1056, 1077, 1049, 1050, 1042, 1026, 1041, 1081, 1113.5, 1050, 1070, 1042, 1096, 1073, 1113, 1040, 1063, 1067, 1032, 1048, 1045, 1068, 1098, 1040, 1057, 1081, 1039, 1110, 1055, 1076, 1060, 1075, 1130, 1078, 1055, 1067, 1060, 1098, 1058, 1046, 1078, 1080, 1050, 1063, 1068, 1096, 1078, 1049, 1067, 1055, 1013, 1094, 1060, 1013, 1050, 1053, 1072, 1042.9, 1084, 1085, 1120, 1072, 1064, 1083, 1039, 1053, 1060, 1068, 1045, 1150, 1093, 1098, 1052, 1048, 1043, 1075, 1067, 1033, 1061, 1156, 1068, 1047, 1043, 1064, 1097, 1045, 1068, 1065, 1064, 1064, 1045, 1090, 1069, 1125, 1063, 1027, 1069, 1032.5, 1044, 1060, 1050, 1128, 1108, 1076, 1059, 1056, 1007, 1072, 1053, 1084, 1048, 1053, 1074, 1066, 1120, 1104, 1067, 1089, 1069, 1065, 1052, 1068, 1062, 1048, 1066, 1053, 1094, 1069, 1088, 1045, 1007, 1015, 1008, 1025, 1015, 1065, 1016, 1010, 1065, 1065, 1045, 1045, 1045, 1067), EBC = c(20, 15, 8, 30, 10, 15, 12, 22, 120, 200, 140, 62, 219, 70, 25, NA, 36, 12, 8, 50, 100, 19, 90, 30, 30, 30, 44, NA, 64, 40, 30, 16, 300, 40, 13, 65, 20, 111, 30, 80, 14, 300, 40, 60, 30, 250, 19.5, 97, 12, 46, 15, 23, 14, 15, 110, 11.5, 17, 197, 45, 12, 250, 23, 40, 30, 115, 59, 400, 12, 24, 30, 2, 44, 25, 30, 130, 25, 10, 15, 18, 158, 30, 30, 25, 240, 24, 90, 15, 30, 30, 30, 54, 25, 70, 200, 8, 15, 250, 115, 31.2, 45, 15, 200, 19, 400, NA, 19, 60, 177.3, 200, 18, 20, 40, 100, 15, 12, 180, 6, 25, 14, 30, 30, 57, NA, 164, 10, 16, 10, 195, 30, 57, 20, 128, 15, 12, 10, 12, 65, 20, 150, 15, 19, 12, 30, 190, 50, 400, 30, 10, 30, 42, 19, 35, 17, 300, 79, 30, 50, 17, 9, 40, 25, 190, 35, 165, 35, 30, 100, 38, 71, 15, 50, 14, 200, 86, 230, 13, 30, 200, 400, 60, 25, 18, 
8, 500, 25, 67, 300, 15, 78.8, 13, 17, 104, 18, 18, 18, 20), PH = c(4.4, 4.4, 3.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.2, 5.2, 4.4, 4.4, 4.4, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 3.2, 4.4, 4.4, 4.4, 4.4, 4.3, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.2, 4.4, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.5, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 3.2, 5.2, 4.4, 4.4, 4.4, 5.2, 4.4, 4, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 3.5, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2, 4.2, 4.4, 4.4, 4.2, 4.4, 4.4, 4.4, 4.3, 3.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2, 4.4, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2, 4.2, 4.5, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.2, 4.4, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.3, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 3.2, 4.4, 4.4, 4.5, 4.4, 5.2, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.3, 4.2, 4.4, 4.2, 3.2, 4.4, 4.2, 4, 4.4, 4.4, 4.2, 4.2, 4.4, 4.4, 4.2, 4.2, 4.2, 4.4), AttenuationLevel = c(75, 76, 83, 80, 67, 88.9, 78, 80.9, 74.7, 77, 74.5, 72.8, 70.1, 87, 79.3, 83, 68, 86, 79, 68.4, 98, 79.7, 79.7, 77.3, 28.6, 82.1, 90, 83, 102, 81.2, 82.1, 83, 76.2, 81.2, 85, 79.4, 100, 79.17, 77.3, 85, 89.6, 84.4, 72.9, 82.6, 82.1, 76.8, 83, 76, 84, 70, 81.4, 83.2, 82.1, 79.2, 79, 84, 76.2, 74.5, 75.6, 74, 76.8, 76, 81.4, 76.2, 79.2, 79.5, 84.1, 79.5, 82.6, 82.1, 88, 72.9, 75.6, 82.1, 79.6, 70, 87, 93.8, 76.9, 82, 74.6, 82.9, 83.3, 81.3, 102.3, 83.3, 78, 82.1, 80, 70, 74, 73.9, 83.3, 81.3, 87, 84, 70.6, 79.2, 84.6, 81.6, 80.6, 70, 79.7, 73.4, 87, 79.7, 76, 84.9, 79.2, 81, 82.1, 81.2, 98, 90.3, 84, 83.1, 87, 79.3, 83, 82.1, 73.3, 93.3, 80, 79.6, 87, 79, 79.1, 81.3, 82.1, 70.8, 80.3, 80.8, 95.6, 80.7, 83.7, 84, 79.4, 73.9, 78.6, 84.6, 79.7, 84, 82.9, 80, 82.6, 84, 81, 70.4, 82.6, 63.1, 72.7, 76.7, 80, 89, 81.5, 82.9, 81.4, 82.14, 82.5, 80.6, 79.3, 79.8, 77.1, 75.5, 82.4, 77.3, 98, 85, 79, 94.4, 81.1, 87, 73.1, 76.5, 
67.7, 79.2, 77.3, 73.6, 73.4, 82.6, 83, 75.6, 78, 84, 75.6, 75.6, 84.4, 84.6, 81, 78.7, 84.6, 84.6, 75.6, 75.6, 75.6, 82), FermentationTempCelsius = c(19L, 18L, 21L, 9L, 10L, 22L, 10L, 19L, 19L, 19L, 19L, 22L, 18L, 17L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 18L, 19L, 19L, 19L, 19L, 21L, 21L, 21L, 19L, 21L, 21L, 21L, 9L, 19L, 20L, 21L, 19L, 19L, 22L, 21L, 19L, 18L, 19L, 18L, 19L, 19L, 19L, 12L, 23L, 21L, 10L, 9L, 19L, 19L, 19L, 21L, 19L, 19L, 18L, 18L, 21L, 19L, 20L, 20L, 21L, 10L, 19L, 19L, 21L, 19L, 19L, 19L, 21L, 19L, 20L, 23L, 19L, 21L, 19L, 21L, 19L, 20L, 21L, 21L, 19L, 19L, 19L, 21L, 19L, 9L, 22L, 14L, 20L, 19L, 19L, 20L, 18L, 14L, 19L, 19L, 19L, 21L, 20L, 19L, 19L, 19L, 21L, 10L, 21L, 21L, 19L, 18L, 19L, 21L, 20L, 17L, 20L, 19L, 19L, 22L, 19L, 20L, 20L, 19L, 15L, 19L, 19L, 19L, 19L, 21L, 21L, 10L, 12L, 19L, 21L, 19L, 19L, 21L, 19L, 19L, 20L, 21L, 22L, 21L, 99L, 19L, 19L, 22L, 16L, 19L, 19L, 21L, 18L, 21L, 19L, 19L, 19L, 21L, 17L, 21L, 19L, 19L, 19L, 19L, 19L, 21L, 19L, 23L, 19L, 20L, 19L, 19L, 19L, 19L, 19L, 19L, 21L, 18L, 21L, 19L, 21L, 21L, 12L, 21L, 21L, 21L, 21L, 12L, 21L, 21L, 19L, 19L, 19L, 21L), Yeast = structure(c(1L, 1L, 1L, 3L, 3L, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 3L, 1L, 2L, 2L, 1L, 2L, 4L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 4L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 3L, 1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 1L, 4L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 4L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 1L, 1L, 1L, 2L), .Label = c("Wyeast 1056 - American Ale", "Wyeast 1272 - American 
Ale II", "Wyeast 2007 - Pilsen Lager", "Wyeast 3711 - French Saison"), class = "factor")), class = "data.frame", row.names = c(NA, -196L))
df
To solve your specific issue, you can generate dummy variables to run your desired clustering.
One way to do it is using the dummy_columns() function from the fastDummies package.
library(fastDummies)
library(NbClust)

# One-hot encode the categorical Yeast column. remove_first_dummy = TRUE drops
# one reference level so the indicator columns are not perfectly collinear
# (all levels together would sum to 1 in every row), which can break indices
# that rely on inverting a scatter matrix.
df_dummy <- dummy_columns(
  df,
  select_columns = "Yeast",
  remove_first_dummy = TRUE,
  remove_selected_columns = TRUE
)

# Cluster on every column except the Name identifier. Selecting by name rather
# than by position (2:9) keeps all remaining Yeast indicator columns; a fixed
# 2:9 range would only reach the first of them.
cluster_vars <- setdiff(names(df_dummy), "Name")
res <- NbClust(
  df_dummy[cluster_vars],
  min.nc = 2, max.nc = 15, method = "ward.D2"
)
As noted in the comments, questions about best practices for conducting a clustering analysis are better suited to CrossValidated.
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 1 year ago.
Improve this question
I ran multiple imputation to impute missing data for 2 variables of a data frame, then I got a new data frame (with 2 columns for 2 imputed variables).
Now, I want to replace the 2 columns in the original data frame with the two newly imputed columns from my new dataframe.
What should I do?
Original data frame
new data frame for imputed variables
This is the code I used. Only 2 columns in this data frame have missing data, so I only imputed those two. Is that OK? Can you please suggest a better way?
library("mice")
# Per the question, only ABV and EBC contain missing values, so just those two
# columns are handed to mice() (with m = 5 and maxit = 10).
incomplete_cols <- subset(data, select = c("ABV", "EBC"))
imi <- mice(incomplete_cols, m = 5, maxit = 10)
Data
structure(list(Name = structure(c(58L, 188L, 40L, 155L, 32L,
88L, 92L, 55L, 135L, 31L, 139L, 26L, 126L, 10L, 166L, 104L, 75L,
180L, 35L, 175L, 77L, 99L, 4L, 71L, 141L, 176L, 53L, 39L, 172L,
196L, 123L, 107L, 16L, 96L, 82L, 185L, 30L, 15L, 94L, 129L, 187L,
151L, 33L, 23L, 28L, 44L, 157L, 69L, 132L, 83L, 131L, 11L, 182L,
181L, 54L, 115L, 116L, 183L, 150L, 195L, 45L, 144L, 1L, 110L,
17L, 114L, 9L, 117L, 112L, 70L, 34L, 169L, 27L, 66L, 3L, 73L,
133L, 91L, 154L, 130L, 160L, 105L, 90L, 165L, 67L, 100L, 162L,
98L, 29L, 68L, 189L, 192L, 102L, 190L, 134L, 136L, 52L, 12L,
81L, 59L, 63L, 122L, 93L, 109L, 178L, 138L, 5L, 43L, 140L, 95L,
2L, 174L, 76L, 51L, 156L, 60L, 149L, 128L, 177L, 142L, 103L,
7L, 8L, 14L, 164L, 74L, 145L, 148L, 113L, 86L, 108L, 48L, 163L,
6L, 186L, 89L, 36L, 191L, 125L, 120L, 62L, 65L, 124L, 168L, 147L,
79L, 173L, 84L, 193L, 25L, 146L, 121L, 127L, 153L, 13L, 106L,
119L, 161L, 49L, 97L, 101L, 61L, 137L, 24L, 85L, 194L, 78L, 41L,
170L, 47L, 118L, 184L, 179L, 72L, 42L, 111L, 87L, 57L, 38L, 37L,
171L, 22L, 50L, 80L, 159L, 18L, 152L, 64L, 56L, 158L, 167L, 46L,
19L, 21L, 20L, 143L), .Label = c("#Mashtag 2013", "#Mashtag 2014",
"#Mashtag 2015", "10 Heads High", "5am Saint", "77 Lager", "AB:02",
"AB:03", "AB:04", "AB:06", "AB:08", "AB:10", "AB:11", "AB:13",
"AB:15", "AB:17", "AB:18", "AB:20", "Ace Of Chinook", "Ace Of Citra",
"Ace Of Equinox", "Ace Of Simcoe", "Albino Squid Assasin", "Alice Porter",
"All Day Long - Prototype Challenge", "Alpha Dog", "Alpha Pop",
"Amarillo - IPA Is Dead", "American Ale", "Anarchist Alchemist",
"Arcade Nation", "Avery Brown Dredge", "Baby Dogma", "Baby Saison - B-Sides",
"Bad Pixie", "Barley Wine - Russian Doll", "Barrel Aged Albino Squid Assassin",
"Barrel Aged Hinterland", "Berliner Weisse With Raspberries And Rhubarb - B-Sides",
"Berliner Weisse With Yuzu - B-Sides", "Bitch Please (w/ 3 Floyds)",
"Black Dog", "Black Eye Joe (w/ Stone Brewing Co)", "Black Eyed King Imp",
"Black Eyed King Imp - Vietnamese Coffee Edition", "Black Hammer",
"Black Jacques", "Black Tokyo Horizon (w/Nøgne Ã\230 & Mikkeller)",
"Blitz Berliner Weisse", "Blitz Series", "Born To Die", "Bounty Hunter - Shareholder Brew",
"Bourbon Baby", "Bracken's Porter", "Bramling X", "Brewdog Vs Beavertown",
"Brixton Porter", "Buzz", "Candy Kaiser", "Cap Dog (w/ Cap Brewery)",
"Catherine's Pony (w/ Beavertown)", "Challenger", "Chaos Theory",
"Chili Hammer", "Chinook - IPA Is Dead", "Citra", "Clown King",
"Cocoa Psycho", "Coffee Imperial Stout", "Comet", "Dana - IPA Is Dead",
"Dead Metaphor", "Dead Pony Club", "Deaf Mermaid - B-Sides",
"Devine Rebel (w/ Mikkeller)", "Dog A", "Dog B", "Dog C", "Dog D",
"Dog E", "Dog Fight (w/ Flying Dog)", "Dog Wired (w/8 Wired)",
"Dogma", "Doodlebug", "Double IPA - Russian Doll", "Edge", "El Dorado - IPA Is Dead",
"Electric India", "Ella - IPA Is Dead", "Elvis Juice V2.0 - Prototype Challenge",
"Everday Anarchy", "Fake Lager", "Galaxy", "Goldings - IPA Is Dead",
"Growler", "Hardcore IPA", "Hardkogt IPA", "HBC 366 - IPA Is Dead",
"HBC 369", "Hello My Name Is Beastie", "Hello My Name Is Holy Moose",
"Hello My Name Is Ingrid", "Hello My Name Is Little Ingrid",
"Hello My Name Is Mette-Marit", "Hello My Name Is PaÌ\210ivi",
"Hello My Name is Sonja (w/ Evil Twin)", "Hello My Name is Vladimir",
"Hello My Name Is ZeÌ\201 (w/ 2Cabeças)", "Hinterland", "Hobo Pop",
"Hop Fiction - Prototype Challenge", "Hopped-Up Brown Ale - Prototype Challenge",
"Hoppy Christmas", "Hops Kill Nazis", "Hunter Foundation Pale Ale",
"Hype", "India Session Lager - Prototype Challenge", "International Arms Race (w/ Flying Dog)",
"Interstellar", "Jack Hammer", "Jasmine IPA", "Jet Black Heart",
"Kohatu - IPA Is Dead", "Konnichiwa Kitsune", "Libertine Black Ale",
"Libertine Porter", "Lichtenstein Pale Ale", "Lizard Bride - Prototype Challenge",
"Lost Dog (w/Lost Abbey)", "Lumberjack Stout", "Magic Stone Dog (w/Magic Rock & Stone Brewing Co.)",
"Mandarina Bavaria - IPA Is Dead", "Mango Gose - B-Sides", "Melon And Cucumber IPA - B-Sides",
"Misspent Youth", "Morag's Mojito - B-Sides", "Moshi Moshi 15",
"Motueka", "Movember", "Mr.Miyagi's Wasabi Stout", "Nanny State",
"Nelson Sauvin", "Neon Overlord", "Never Mind The Anabolics",
"No Label", "Nuns With Guns", "Old World India Pale Ale", "Old World Russian Imperial Stout",
"Orange Blossom - B-Sides", "Pale - Russian Doll", "Paradox Islay",
"Paradox Islay 2.0", "Paradox Jura", "Peroxide Punk", "Pilsen Lager",
"Pioneer - IPA Is Dead", "Prototype 27", "Prototype Helles",
"Prototype Pils 2.0", "Pumpkin King", "Punk IPA 2007 - 2010",
"Punk IPA 2010 - Current", "Restorative Beverage For Invalids And Convalescents",
"Rhubarb Saison - B-Sides", "Riptide", "Russian Doll â\200“ India Pale Ale",
"Rye Hammer", "San Diego Scotch Ale (w/Ballast Point)", "Santa Paws",
"Shareholder Black IPA 2011", "Ship Wreck", "Shipwrecker Circus (w/ Oskar Blues)",
"Simcoe", "Sink The Bismarck!", "Skull Candy", "Sorachi Ace",
"Sorachi Bitter - B-Sides", "Spiced Cherry Sour - B-Sides", "Stereo Wolf Stout - Prototype Challenge",
"Storm", "Sub Hop", "Sunk Punk", "Sunmaid Stout", "Sunshine On Rye - B-Sides",
"The Physics", "This. Is. Lager", "TM10", "Trashy Blonde", "Truffle and Chocolate Stout - B-Sides",
"U-Boat (w/ Victory Brewing)", "Vagabond Pale ALe - Prototype Challenge",
"Vagabond Pilsner", "Vic Secret", "Waimea - IPA Is Dead", "Whisky Sour - B-Sides",
"Zephyr"), class = "factor"), ABV = c(4.5, 4.1, 4.2, 6.3, 7.2,
NA, 4.7, 7.5, 7.3, 5.3, 4.5, 4.5, 6.1, 11.2, 6, 8.2, 12.5, 8,
4.7, 3.5, 15, 6.7, 7.8, 6.7, 0.5, 7.5, 5.8, 3.6, 10.5, 12.5,
7.2, 8.2, 10.7, 9.2, 7.1, 5, 16.5, 12.8, 6.7, 10, NA, 10, 4.5,
7.4, 7.2, 9.5, 9.2, 9, 7.2, 7.5, NA, 10.43, 7.1, 8, 5, 5.4, 4.1,
10.2, 4, 7, 12.7, 6.5, 7.5, 4.2, 11.8, 7.6, 15, 4.4, 6.3, 7.2,
NA, 4.5, 4.5, 7.5, 10, 3.8, 6.4, NA, 4, 15.2, 5.4, 8.3, 6.5,
8, 12, 8.2, 5.6, 7.2, 6.3, 10, 5.6, 4.5, 8.2, 8.4, 6, 6.7, 6.5,
11.5, 8.5, 5.2, 7.1, 4.7, 6.7, 9, 6.5, 6.7, 5, 5.8, 7.5, 4.5,
9, 41, 15, 8.5, 7.2, 9, 3.8, 5.7, 6.3, 7.5, 4.4, 18, 10.5, 11.3,
NA, 5.2, 4.5, 9.5, 7.2, 2.7, 6.4, 17.2, 8.5, 4.9, 4.7, 7.2, 10,
4.5, 7.2, 7.2, 6.7, 7.2, 4.4, 9, 7.5, 16.1, 6.7, 2.5, 7.4, 2.8,
4.2, 5.8, 5.2, 10, 12.8, 8.3, 6.5, 6, 3, 7.6, 5.5, 8.8, 5.2,
5.2, 8, 6.7, 15, 11.5, 7.1, NA, 7.5, 7.2, 5.2, 6.8, 5.5, 5.2,
6.7, 5, 9, 9.2, 13.8, 4.5, 3.2, 16.1, 4.7, 14.2, 13, 7.2, 9.2,
4.9, 7.2, 7.2, 4.5, 4.5, 4.5, 7.6), IBU = c(60, 41.5, 8, 55,
59, 38, 40, 75, 30, 60, 50, 42, 45, 150, 70, 70, 100, 60, 45,
33, 90, 67, 70, 70, 55, 75, 35, 8, 85, 125, 70, 70, 100, 125,
65, 47, 20.5, 50, 70, 35, 20, 55, 35, 65, 70, 85, 149, 65, 100,
30, 30, 65, 68, 35, 50, 35, 65, 50, 35, 20, 85, 35, 50, 50, 80,
70, 80, 35, 85, 70, 9, 35, 30, 70, 85, 35, 40, 45, 40, 20, 20,
70, 60, 45, 85, 42, 40, 70, 55, 85, 30, 55, 42, 50, 50, 40, 35,
80, 65, 45, 90, 45, 67, 85, 20, 67, 30, 40, 90, 38, 50, 1085,
90, 85, 100, 80, 20, 35, 130, 75, 35, 70, 14, 50, 25, 65, 25,
80, 70, 36, 50, 75, 100, 30, 37, 100, 80, 55, 50, 250, 67, 100,
70, 70, 80, 85, 70, 35, 70, 30, 25, 40, 50, 55, 70, 70, 55, 60,
8, 175, 35, 40, 45, 55, 85, 70, 90, 50, 80, 45, 0, 130, 55, 30,
60, 40, 70, 50, 85, 65, 60, 40, 8, 100, 25, 20, 100, 250, 50,
18, 250, 250, 40, 40, 40, 70), OG = c(1044, 1041.7, 1040, 1060,
1069, 1045, 1046, 1068, 1079, 1052, 1047, 1046, 1067, 1098, 1058,
1076, 1093, 1082, 1047, 1038, 1120, 1013, 1074, 1066, 1007, 1068,
1049, 1040, 1102, 1087, 1067, 1076, 1105, 1085, 1065, 1048.5,
1112, 1096, 1066, 1080, 1048, 1090, 1048, 1069, 1067, 1095, 1083,
1080, 1064, 1080, 1043, 1095, 1056, 1077, 1049, 1050, 1042, 1026,
1041, 1081, 1113.5, 1050, 1070, 1042, 1096, 1073, 1113, 1040,
1063, 1067, 1032, 1048, 1045, 1068, 1098, 1040, 1057, 1081, 1039,
1110, 1055, 1076, 1060, 1075, 1130, 1078, 1055, 1067, 1060, 1098,
1058, 1046, 1078, 1080, 1050, 1063, 1068, 1096, 1078, 1049, 1067,
1055, 1013, 1094, 1060, 1013, 1050, 1053, 1072, 1042.9, 1084,
1085, 1120, 1072, 1064, 1083, 1039, 1053, 1060, 1068, 1045, 1150,
1093, 1098, 1052, 1048, 1043, 1075, 1067, 1033, 1061, 1156, 1068,
1047, 1043, 1064, 1097, 1045, 1068, 1065, 1064, 1064, 1045, 1090,
1069, 1125, 1063, 1027, 1069, 1032.5, 1044, 1060, 1050, 1128,
1108, 1076, 1059, 1056, 1007, 1072, 1053, 1084, 1048, 1053, 1074,
1066, 1120, 1104, 1067, 1089, 1069, 1065, 1052, 1068, 1062, 1048,
1066, 1053, 1094, 1069, 1088, 1045, 1007, 1015, 1008, 1025, 1015,
1065, 1016, 1010, 1065, 1065, 1045, 1045, 1045, 1067), EBC = c(20,
15, 8, 30, 10, 15, 12, 22, 120, 200, 140, 62, 219, 70, 25, NA,
36, 12, 8, 50, 100, 19, 90, 30, 30, 30, 44, NA, 64, 40, 30, 16,
300, 40, 13, 65, 20, 111, 30, 80, 14, 300, 40, 60, 30, 250, 19.5,
97, 12, 46, 15, 23, 14, 15, 110, 11.5, 17, 197, 45, 12, 250,
23, 40, 30, 115, 59, 400, 12, 24, 30, 2, 44, 25, 30, 130, 25,
10, 15, 18, 158, 30, 30, 25, 240, 24, 90, 15, 30, 30, 30, 54,
25, 70, 200, 8, 15, 250, 115, 31.2, 45, 15, 200, 19, 400, NA,
19, 60, 177.3, 200, 18, 20, 40, 100, 15, 12, 180, 6, 25, 14,
30, 30, 57, NA, 164, 10, 16, 10, 195, 30, 57, 20, 128, 15, 12,
10, 12, 65, 20, 150, 15, 19, 12, 30, 190, 50, 400, 30, 10, 30,
42, 19, 35, 17, 300, 79, 30, 50, 17, 9, 40, 25, 190, 35, 165,
35, 30, 100, 38, 71, 15, 50, 14, 200, 86, 230, 13, 30, 200, 400,
60, 25, 18, 8, 500, 25, 67, 300, 15, 78.8, 13, 17, 104, 18, 18,
18, 20), PH = c(4.4, 4.4, 3.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4,
4.2, 5.2, 4.4, 4.4, 4.4, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4,
4.4, 4.4, 4.4, 4.4, 4.4, 3.2, 4.4, 4.4, 4.4, 4.4, 4.3, 4.4, 4.4,
4.4, 4.4, 4.4, 4.4, 4.4, 4.2, 4.4, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4,
4.4, 4.5, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 3.2, 5.2,
4.4, 4.4, 4.4, 5.2, 4.4, 4, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4, 4.4,
4.4, 4.4, 3.5, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4,
4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2, 4.2, 4.4, 4.4, 4.2,
4.4, 4.4, 4.4, 4.3, 3.2, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4,
4.4, 4.4, 5.2, 5.2, 4.4, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2,
4.2, 4.5, 4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.2, 4.4, 4.4, 4.2, 4.4,
4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 4.3, 4.4, 4.2, 4.4, 4.4, 4.4, 4.4,
4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 3.2, 4.4, 4.4, 4.5, 4.4, 5.2, 5.2,
4.4, 4.4, 4.4, 4.4, 4.4, 4.4, 5.2, 5.2, 4.4, 4.4, 4.4, 4.4, 4.4,
4.3, 4.2, 4.4, 4.2, 3.2, 4.4, 4.2, 4, 4.4, 4.4, 4.2, 4.2, 4.4,
4.4, 4.2, 4.2, 4.2, 4.4), AttenuationLevel = c(75, 76, 83, 80,
67, 88.9, 78, 80.9, 74.7, 77, 74.5, 72.8, 70.1, 87, 79.3, 83,
68, 86, 79, 68.4, 98, 79.7, 79.7, 77.3, 28.6, 82.1, 90, 83, 102,
81.2, 82.1, 83, 76.2, 81.2, 85, 79.4, 100, 79.17, 77.3, 85, 89.6,
84.4, 72.9, 82.6, 82.1, 76.8, 83, 76, 84, 70, 81.4, 83.2, 82.1,
79.2, 79, 84, 76.2, 74.5, 75.6, 74, 76.8, 76, 81.4, 76.2, 79.2,
79.5, 84.1, 79.5, 82.6, 82.1, 88, 72.9, 75.6, 82.1, 79.6, 70,
87, 93.8, 76.9, 82, 74.6, 82.9, 83.3, 81.3, 102.3, 83.3, 78,
82.1, 80, 70, 74, 73.9, 83.3, 81.3, 87, 84, 70.6, 79.2, 84.6,
81.6, 80.6, 70, 79.7, 73.4, 87, 79.7, 76, 84.9, 79.2, 81, 82.1,
81.2, 98, 90.3, 84, 83.1, 87, 79.3, 83, 82.1, 73.3, 93.3, 80,
79.6, 87, 79, 79.1, 81.3, 82.1, 70.8, 80.3, 80.8, 95.6, 80.7,
83.7, 84, 79.4, 73.9, 78.6, 84.6, 79.7, 84, 82.9, 80, 82.6, 84,
81, 70.4, 82.6, 63.1, 72.7, 76.7, 80, 89, 81.5, 82.9, 81.4, 82.14,
82.5, 80.6, 79.3, 79.8, 77.1, 75.5, 82.4, 77.3, 98, 85, 79, 94.4,
81.1, 87, 73.1, 76.5, 67.7, 79.2, 77.3, 73.6, 73.4, 82.6, 83,
75.6, 78, 84, 75.6, 75.6, 84.4, 84.6, 81, 78.7, 84.6, 84.6, 75.6,
75.6, 75.6, 82), FermentationTempCelsius = c(19L, 18L, 21L, 9L,
10L, 22L, 10L, 19L, 19L, 19L, 19L, 22L, 18L, 17L, 19L, 19L, 19L,
19L, 19L, 19L, 19L, 19L, 18L, 19L, 19L, 19L, 19L, 21L, 21L, 21L,
19L, 21L, 21L, 21L, 9L, 19L, 20L, 21L, 19L, 19L, 22L, 21L, 19L,
18L, 19L, 18L, 19L, 19L, 19L, 12L, 23L, 21L, 10L, 9L, 19L, 19L,
19L, 21L, 19L, 19L, 18L, 18L, 21L, 19L, 20L, 20L, 21L, 10L, 19L,
19L, 21L, 19L, 19L, 19L, 21L, 19L, 20L, 23L, 19L, 21L, 19L, 21L,
19L, 20L, 21L, 21L, 19L, 19L, 19L, 21L, 19L, 9L, 22L, 14L, 20L,
19L, 19L, 20L, 18L, 14L, 19L, 19L, 19L, 21L, 20L, 19L, 19L, 19L,
21L, 10L, 21L, 21L, 19L, 18L, 19L, 21L, 20L, 17L, 20L, 19L, 19L,
22L, 19L, 20L, 20L, 19L, 15L, 19L, 19L, 19L, 19L, 21L, 21L, 10L,
12L, 19L, 21L, 19L, 19L, 21L, 19L, 19L, 20L, 21L, 22L, 21L, 99L,
19L, 19L, 22L, 16L, 19L, 19L, 21L, 18L, 21L, 19L, 19L, 19L, 21L,
17L, 21L, 19L, 19L, 19L, 19L, 19L, 21L, 19L, 23L, 19L, 20L, 19L,
19L, 19L, 19L, 19L, 19L, 21L, 18L, 21L, 19L, 21L, 21L, 12L, 21L,
21L, 21L, 21L, 12L, 21L, 21L, 19L, 19L, 19L, 21L), Yeast = structure(c(1L,
1L, 1L, 3L, 3L, 4L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 3L, 1L, 2L, 2L, 1L, 2L, 4L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
3L, 4L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 3L, 1L, 1L, 4L, 1L, 1L, 1L, 2L, 1L, 1L, 4L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 3L, 2L, 3L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 4L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 3L, 3L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 1L,
1L, 1L, 2L), .Label = c("Wyeast 1056 - American Ale", "Wyeast 1272 - American Ale II",
"Wyeast 2007 - Pilsen Lager", "Wyeast 3711 - French Saison"), class = "factor")), class = "data.frame", row.names = c(NA,
-196L))
Updated
As @dcarlson recommended, you can run mice on the entire dataframe and then use complete to get the full imputed dataframe. You can then join the imputed columns back onto your original dataframe.
library(tidyverse)
library(mice)
# Impute the whole data frame (5 imputations, 10 iterations) and pull out a
# completed copy.
imi <- mice(data, m = 5, maxit = 10)
imi_complete <- complete(imi)
# Drop the original ABV/EBC columns first, then attach the imputed ones by
# Name; they end up as the last two columns, named ABV and EBC.
res <- data %>%
  dplyr::select(-c(ABV, EBC)) %>%
  dplyr::left_join(
    dplyr::select(imi_complete, Name, ABV, EBC),
    by = "Name"
  )
Output
# Show the first rows of `res` to check the result.
head(res)
Name IBU OG PH AttenuationLevel FermentationTempCelsius Yeast ABV EBC
1 Buzz 60.0 1044.0 4.4 75.0 19 Wyeast 1056 - American Ale 4.5 20
2 Trashy Blonde 41.5 1041.7 4.4 76.0 18 Wyeast 1056 - American Ale 4.1 15
3 Berliner Weisse With Yuzu - B-Sides 8.0 1040.0 3.2 83.0 21 Wyeast 1056 - American Ale 4.2 8
4 Pilsen Lager 55.0 1060.0 4.4 80.0 9 Wyeast 2007 - Pilsen Lager 6.3 30
5 Avery Brown Dredge 59.0 1069.0 4.4 67.0 10 Wyeast 2007 - Pilsen Lager 7.2 10
6 Electric India 38.0 1045.0 4.4 88.9 22 Wyeast 3711 - French Saison 7.5 15
Old
Since there's no id column in the new dataframe, you can just use mutate to replace the columns in the original dataframe with the columns from the new dataframe. However, it would be better practice to impute directly into the original dataframe (as suggested by @dcarlson and @r2evans), so that you can be sure the values end up on the correct rows.
library(tidyverse)
# Overwrite ABV and EBC in df_orig with the same-named columns of df_new;
# values are matched purely by row position.
dplyr::mutate(df_orig, ABV = df_new[["ABV"]], EBC = df_new[["EBC"]])
Output
id ABV EBC third
1 1 -61 -58 37.94029
2 2 -80 -67 47.81479
3 3 -62 -66 48.85903
4 4 -69 -78 23.18026
5 5 -51 -77 29.91952
Data
# Example "original" data: a plain 5-row data.frame with an `id` column, the
# two columns to be replaced (`ABV`, `EBC`) and one unrelated column (`third`).
df_orig <-
structure(
list(
id = c(1, 2, 3, 4, 5),
ABV = c(
38.9932923251763,
20.0923723727465,
37.640398349613,
31.4673039061017,
49.192731983494
),
EBC = c(
42.341671793256,
32.936319950968,
33.8184517389163,
21.5938150603324,
22.8182014194317
),
third = c(
37.9402944352478,
47.8147878032178,
48.8590325415134,
23.1802612892352,
29.9195193173364
)
),
class = "data.frame",
row.names = c(NA,-5L)
)
# Example "new" data: 5 rows holding only `ABV` and `EBC` (no id column).
# The class vector starts with "rowwise_df" and the `groups` attribute lists
# every row as its own group.
df_new <-
structure(
list(
ABV = c(-61,-80,-62,-69,-51),
EBC = c(-58,-67,-66,-78,-77)
),
class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"),
row.names = c(NA,-5L),
groups = structure(
list(.rows = structure(
list(1L, 2L, 3L, 4L, 5L),
ptype = integer(0),
class = c("vctrs_list_of",
"vctrs_vctr", "list")
)),
row.names = c(NA,-5L),
class = c("tbl_df",
"tbl", "data.frame")
)
)
I have the following dataset:
# Example input: 100 rows with a numeric `group_id` and a numeric `score`
# that contains both zeros and NA values. The class vector starts with
# "rowwise_df" and the `groups` attribute lists each of the 100 rows
# separately.
df1 <- structure(list(group_id = c(3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29),
score = c(35, 0, 37.5, 51.9, 43, 41, 36.9, 44.4, 27.5, 41.5,
60, 39.4, 39.5, 50, 55, 57.8, 44.7, 60.2, 40.4, 62.5, 61.1,
53.9, 67.2, 43.9, 37.6, 58.4, 34.1, 56.4, 41.5, 54.4, 50.3,
36.8, 41.4, 37.2, 51.3, 50.7, 75.4, 62.9, NA, 54.5, 53.9,
59.5, 24.5, 22.7, 53, 35.8, 28, 39.4, 44.5, NA, NA, 55.9,
52.5, 36, 43.5, 42.9, 25.5, 35, 46, NA, 60.2, 65.6, 30.5,
37.1, 49.1, 70.4, 34.1, 45.4, 30.8, 38.6, 28.7, 39.8, 38.5,
0, 72.6, 0, NA, 54.6, 0, 69.8, 31.6, 55.9, 47.3, 34.3, 0,
40.8, 69.7, 61.5, 48.6, 59.3, 0, 67.2, 52, 57, 0, NA, 0,
51.7, 47.1, 0)), row.names = c(NA, -100L), groups = structure(list(
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L,
21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L,
32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L,
43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L,
54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L,
65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L,
76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L,
87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -100L), class = c("tbl_df",
"tbl", "data.frame")), class = c("rowwise_df", "tbl_df", "tbl",
"data.frame"))
I need to create a new dataset with just one line of information by group. The information I need is to be summarized in two columns. The first column shows the mean score by group. Only values that are non-zero and non-missing should be computed in the mean score. The second column contains the number of non-zero and non-missing scores by group.
I do NOT want a distinct/unique count, but the old traditional count: if two scores have the same non-missing and non-zero value they still must be counted twice.
The expected result is:
# Expected result: one row per group, with the mean of the non-zero,
# non-missing scores (`score`) and how many such scores there are
# (`n_individuals`).
# Group 27 has five valid scores (30.8, 38.6, 28.7, 39.8, 38.5), whose mean
# is 176.4 / 5 = 35.28.
df2 <- structure(list(group_id = c(3L, 10L, 16L, 26L, 27L, 29L), score = c(43.04,
49.56, 44.86, 49.05, 35.28, 54.18), n_individuals = c(14L, 20L,
21L, 8L, 5L, 17L)), class = "data.frame", row.names = c(NA, -6L
))
What I tried:
library(dplyr)
# Attempt: turn zero scores into NA, then compute a per-group mean and a
# per-group count of the remaining scores.
df2 <- df1 %>%
mutate(score = case_when(
score == 0 ~ NA_real_, # replace zeros with missing values
TRUE ~ score)) %>%
group_by(group_id) %>% # group by group_id
summarise(score = mean(score, na.rm = TRUE), # mean score
n_individuals = count(score)) # n of individuals with valid score
What I get:
Error: Problem with `summarise()` input `n_inviduals`. x no applicable method for 'tbl_vars' applied to an object of class "c('double', 'numeric')" i Input `n_inviduals` is `count(score)`. i The error occured in group 1: group_id = 3.
The input to count must be a tibble or data.frame, not a vector. Here, we could use n() if we want the total number of rows. If we instead want the number of non-NA elements in 'score', create a logical vector with is.na, negate it, and get the count with sum: TRUE counts as 1 and FALSE as 0, so sum returns the number of TRUE values.
library(dplyr)
# Drop the row-wise grouping, convert zeros to NA once so that "valid" simply
# means "not NA", then count and average the valid scores per group_id.
df1 %>%
  ungroup() %>%
  mutate(score = na_if(score, 0)) %>%
  group_by(group_id) %>%
  summarise(
    n_individuals = sum(!is.na(score)),
    score = mean(score, na.rm = TRUE)
  )
-output
# A tibble: 6 x 3
# group_id n_individuals score
#* <dbl> <int> <dbl>
#1 3 14 43.0
#2 10 20 49.6
#3 16 21 44.9
#4 26 8 49.0
#5 27 5 35.3
#6 29 17 54.2
In a previous question (for loop in irregular time series) I presented a dataframe (see the dput below) where user rnso gave the following, which works quite well for me.
# For every site, print the largest non-missing value found in each of
# columns 3 to 12.
for (ss in unique(mydf$site_id)) {
  site_rows <- mydf$site_id == ss  # row mask is the same for every column
  for (cc in 3:12) {
    # do whatever function
    print(max(mydf[site_rows, cc], na.rm = TRUE))
  }
}
> [1] 304 [1] 16.8 [1] 8.43 [1] 286 [1] 2 [1] 36 [1] 93 [1] 30 [1] 5.98
> [1] 69 [1] -38 [1] 14.7 [1] 7.85 [1] 515 [1] 2 [1] 18 [1] 180 [1] 106
> [1] 0.1 [1] 655'
I have been using
# Position of the last element: the largest index in the sequence along `data`.
idx <- max(seq(along=data))
# Value at that last position, rounded to 3 significant digits.
lastx <- signif(data[idx], digits=3)
to identify the last data point for each mydf[ss,cc], which works well.
Question: how can I further subset mydf[idx] to pull out the date for each idx? I've tried bunches of permutations, and usually get some flavor of "incorrect dimensions" error.
thanks all!
data:
# Example data: a 53-row data.frame with a character `site_id` (two distinct
# sites), a `date` column of class "Date", and ten measurement columns
# `var1` to `var10`, several of which contain NA values.
mydf <- structure(list(site_id = c("39ADA00070", "39ADA00070", "39ADA00070",
"39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070",
"39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070",
"39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070",
"39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070",
"39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070", "39ADA00070",
"39ADA00070", "39ADA00070", "39ALL00184", "39ALL00184", "39ALL00184",
"39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184",
"39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184",
"39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184",
"39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184", "39ALL00184"
), date = structure(c(6339, 8594, 9293, 9441, 10014, 10604, 11080,
11821, 12717, 12907, 13081, 13277, 13459, 13635, 13822, 14012,
14207, 14207, 14355, 14564, 14704, 14917, 15105, 15271, 15478,
15644, 15833, 15834, 16009, 16203, 7783, 8406, 8554, 8686, 9034,
9260, 9632, 9777, 10002, 10491, 10491, 11060, 11585, 12145, 12145,
12696, 13242, 13242, 13775, 14363, 14881, 15428, 15974), class = "Date"),
var1 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 159L, 148L,
149L, 134L, 179L, 205L, 193L, 109L, 109L, 177L, 75L, 272L,
150L, 115L, 232L, 230L, 183L, 159L, 159L, 304L, 220L, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
-98L, -98L, -38L, -74L, -74L, -80L, -48L), var2 = c(NA, NA,
NA, NA, NA, NA, NA, NA, 16.8, 16.8, 14.5, 14.2, 15.1, 14.5,
15, 15.2, 13.2, 13.2, 15, 15.2, 15.1, 14.4, 14.8, 15.2, 16.3,
NA, 14.3, 14.3, 15.6, 14.8, NA, 12, 14.7, NA, 14.6, NA, 13.7,
12.3, 12.5, 13.5, 13.5, 12.5, 13.1, 14.2, 14.2, 14.1, 12.5,
12.5, 13.5, 12.7, 12.6, 12.5, 12.6), var3 = c(NA, NA, NA,
NA, NA, NA, NA, NA, 7.35, 7.85, 7.5, 7.47, 7.62, 7.08, 7.08,
7.2, 7.4, 7.4, 7.26, 7.05, 6.56, 7.2, 7.42, 6.5, 7.81, 8.43,
7.57, 7.57, 7.42, 7.72, NA, 6.58, 6.8, NA, 7.75, NA, 7.06,
6.77, 6.41, 6.84, 6.84, 7.85, 7.13, 7.26, 7.26, 7.06, 7.14,
7.14, 7.11, 6.9, 7.11, 7.2, 7.1), var4 = c(NA, 283L, 216L,
223L, 256L, 165L, 192L, 216L, 173L, 216L, 179L, 282L, 146L,
227L, 141L, 210L, 160L, 162L, 157L, 140L, 235L, 166L, 216L,
NA, 162L, 193L, 286L, 274L, 163L, 209L, NA, 304L, 321L, 293L,
398L, 302L, 301L, 282L, 288L, 292L, 292L, 302L, 515L, 309L,
309L, 323L, 338L, 295L, 280L, 279L, 325L, 328L, 322L), var5 = c(NA,
NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), var6 = c(NA, NA,
29L, 32L, 36L, 24L, 25L, 29L, 27L, 27L, 24L, 32L, 21L, 27L,
21L, 26L, 23L, 24L, 25L, 20L, 24L, 22L, 28L, 24L, 20L, 23L,
30L, 29L, 21L, 24L, 15L, 15L, 18L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L), var7 = c(NA, NA, 77, 83, 87, 66, 73, 73,
65, 76, 69, 93, 60, 76, 56, 77, 67, 68, 68, 60, 67, 63, 82,
69, 56, 68, 85, 83, 59, 68.2, 157, 159, 164, 169, 155, 176,
156, 156, 162, 162, 162, 160, 180, 163, 163, 158, 168, 171,
162, 167, 177, 167, 168), var8 = c(NA, NA, 25, 26, 29, 21,
22, 23, 20, 23, 21, 30, 17, 24, 16, 23, 20, 20, 21, 17, 23,
18, 25, 20, 17, 21, 27, 27, 17, 20.9, 91, 89, 96, 92, 86,
100, 89, 91, 92, 94, 94, 91, 97, 91, 91, 92, 98, 99, 94,
100, 106, 98, 100), var9 = c(1.02, 1, 0.37, 0.48, 0.88, 0.16,
0.17, 0.24, 0.25, 5.98, 0.26, 0.54, 0, 0.19, 0, 0.18, 0.14,
0.13, 0.16, 0.11, 0.19, 0.16, 0.26, NA, 0.11, 0.27, 0.19,
0.19, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, NA, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0, 0, 0.1, 0.1,
0.1), var10 = c(50, 48, 64, 55, 52, 64, 69, 63.3, 56.1, 40.6,
58.6, 43.9, 62.2, 51.9, 55.6, 53.4, 61.3, 61, 61.1, 61.9,
51.5, 60.7, 52.2, NA, 66, 52.8, 46.8, 47.5, 59.2, 53.4, NA,
560, 650, 540, 548, 655, 565, 531, 540, 501, 501, 531, 535,
547, 547, 492, 537, 542, 512, 542, 548, 581, 540)), class = "data.frame", row.names = c(NA,
-53L), .Names = c("site_id", "date", "var1", "var2", "var3",
"var4", "var5", "var6", "var7", "var8", "var9", "var10"))
Here is how I would do this:
library(data.table)
library(reshape2)
# 1. setDT() turns mydf into a data.table; melt() stacks every non-id column
#    into long format (one row per site_id / variable / value).
# 2. The [ ... ] step takes the maximum `value`, ignoring NAs, within each
#    site_id / variable combination.
# 3. dcast() spreads the result back to one row per site_id with one column
#    per variable.
dcast(melt(setDT(mydf),id.vars='site_id')
[,max(value,na.rm=TRUE),'site_id,variable'],
site_id ~variable)
site_id date var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
1 39ADA00070 16203 304 16.8 8.43 286 2 36 93 30 5.98 69
2 39ALL00184 15974 -38 14.7 7.85 515 2 18 180 106 0.10 655
I am using data.table for grouping operations
put your data in the long format, since you want to perform the same operation on many columns
for each variable( even dates) get the maximum value
reshape your data again to the wide format using dcast.