This question already has an answer here:
R: add alpha-value to png-image
(1 answer)
Closed 2 years ago.
I have a two-dimensional density plot which I need to overlay onto a photograph. I can read-in the picture with the package rtiff:
# One-time setup: install the rtiff package, then attach it.
install.packages("rtiff")
library(rtiff)
# Read the photograph (path is relative to the working directory) and draw it.
x <- readTiff('F01_screenshot.tiff')
plot(x)
The data for the density plot is a very large dataframe called eet2 (see reproducible data below); the code for the density plot is this:
library(ggplot2)
# Scatter of the X/Y positions with 2D kernel-density contours on top.
ggplot(data = eet2, mapping = aes(x = X, y = Y)) +
  geom_point() +
  stat_density2d()
I can easily produce both plots separately but don't know how to overlay the density plot onto the photograph in such a way that the density plot is scaled down proportionately to fit the picture. Any help is much appreciated!
Reproducible data:
eet2 <- dput(eet1[1:1000, 2:3])
structure(list(X = c(0, 177.378, 27.289, 0, 852.719, 0, 852.813,
71.068, 0, 144.875, 0, 140.385, 21.598, 0, 170.325, 0, 136.746,
21.038, 0, 146.279, 0, 141.86, 11.822, 0, 146.078, 0, 137.681,
21.182, 0, 148.867, 0, 132.886, 20.444, 0, 146.129, 0, 80.251,
6.688, 0, 141.08, 0, 149.789, 23.044, 0, 74.097, 0, 141.182,
21.72, 0, 83.075, 0, 81.192, 6.766, 0, 849.784, 0, 153.96, 23.686,
0, 78.374, 0, 142.782, 21.967, 0, 72.929, 0, 142.922, 11.91,
0, 83.639, 0, 143.912, 22.14, 0, 134.809, 0, 142.826, 21.973,
0, 132.7, 0, 85.876, 7.156, 0, 138.935, 0, 80.951, 12.454, 0,
144.385, 0, 141.716, 21.802, 0, 76.768, 0, 74.406, 6.2, 0, 134.444,
0, 155.341, 23.899, 0, 189.336, 0, 224.517, 34.541, 0, 207.46,
0, 216.122, 18.01, 0, 204.552, 0, 208.524, 32.081, 0, 207.513,
0, 203.162, 31.256, 0, 197.578, 0, 204.362, 17.03, 0, 195.223,
0, 956.396, 147.138, 0, 201.969, 0, 224.989, 34.614, 0, 214.064,
0, 140.374, 11.698, 0, 140.235, 0, 958.501, 147.462, 0, 141.898,
0, 143.337, 22.052, 0, 955.901, 0, 954.965, 79.58, 0, 131.323,
0, 223.214, 34.341, 0, 952.203, 0, 143.102, 22.016, 0, 935.525,
0, 918.238, 76.52, 0, 123.337, 0, 127.93, 19.682, 0, 915.755,
0, 128.336, 19.744, 0, 913.158, 0, 120.076, 10.006, 0, 911.196,
0, 124.387, 19.136, 0, 911.631, 0, 911.389, 140.214, 0, 910.433,
0, 119.375, 9.948, 0, 115.898, 0, 910.073, 140.011, 0, 915.461,
0, 965.025, 148.465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1125.565,
0, 1074.389, 165.291, 0, 1061.611, 0, 1051.23, 161.728, 0, 1044.3,
0, 269.733, 22.478, 0, 245.869, 0, 206.934, 234.525, 245.609,
250.874, 251.576, 260.164, 262.974, 263.161, 269.25, 262.693,
261.818, 265.634, 266.479, 266.592, 266.39, 262.751, 262.508,
265.093, 262.666, 262.343, 1059.856, 1075.848, 1077.98, 1068.255,
1065.442, 1065.255, 1071.821, 1072.292, 1072.355, 1073.788, 1072.572,
1072.41, 1074.295, 1074.487, 1074.499, 1078.434, 1080.074, 1080.293,
1081.104, 1080.014, 1079.869, 1080.261, 1080.181, 1080.175, 1078.568,
1079.422, 1079.536, 1078.84, 1077.459, 1077.275, 1077.805, 1076.787,
1076.719, 1077.199, 1076.045, 1075.891, 1077.362, 1076.663, 1076.569,
1077.942, 1078.842, 1078.902, 1078.713, 1080.023, 1080.198, 1080.764,
1081.553, 1081.659, 1081.946, 1082.445, 1082.479, 1083.987, 1084.113,
1084.13, 1084.741, 1085.809, 1085.952, 1085.031, 1084.708, 1084.686,
1085.901, 1085.266, 1085.181, 1085.513, 1085.159, 1085.112, 1088.024,
1088.29, 1088.308, 1088.889, 1088.327, 1088.252, 1069.543, 1065.069,
1064.472, 1067.409, 1068.075, 1068.119, 1068.377, 1070.019, 1070.238,
1072.384, 1071.454, 1071.33, 1072.31, 1069.704, 1069.53, 1068.856,
1073.986, 1074.67, 1075.824, 1077.42, 1077.633, 1080.095, 1082.62,
1082.788, 1087.112, 1090.026, 1090.414, 1094.654, 1098.678, 1099.215,
1103.801, 1097.674, 1097.266, 1110.129, 1105.4, 1104.769, 1115.638,
1115.922, 1115.96, 1115.612, 1115.278, 1115.256, 1114.682, 1112.96,
1112.73, 1063.573, 948.249, 932.873, 844.512, 834.23, 833.545,
865.203, 867.402, 867.695, 838.499, 864.3, 867.74, 837.215, 864.187,
865.985, 0, 760.653, 862.073, 861.199, 860.757, 860.699, 0, 861.681,
71.807, 0, 851.287, 855.415, 855.965, 859.554, 864.523, 865.185,
868.109, 873.22, 873.561, 845.881, 882.877, 887.81, 893.391,
899.782, 900.634, 908.492, 56.781, 0, 0, 0, 0, 0, 0, 0, 319.09,
0, 0, 0, 0, 0, 0, 0, 0, 0, 696.905, 0, 704.008, 58.667, 0, 0,
0, 0, 677.276, 673.676, 673.195, 673.45, 671.209, 671.06, 672.474,
674.07, 674.283, 674.896, 656.33, 653.855, 678.654, 681.884,
682.099, 684.933, 715.507, 719.584, 735.659, 1290.244, 1364.188,
1362.632, 1366.322, 1366.568, 1371.973, 1364.1, 1363.051, 1361.383,
812.325, 739.117, 734.716, 1322.199, 1361.365, 730.422, 731.132,
731.227, 729.084, 724.164, 723.507, 726.199, 725.939, 725.922,
725.437, 1278.917, 1352.714, 726.791, 1281.431, 1355.383, 1366.558,
764.435, 724.294, 725.46, 724.544, 724.422, 724.178, 1288.932,
1364.232, 723.902, 680.495, 677.601, 724.172, 722.856, 722.681,
735.236, 743.706, 744.835, 554.779, 735.387, 747.427, 724.914,
720.25, 719.628, 721.122, 742.971, 745.884, 742.171, 737.947,
737.665, 729.207, 808.032, 818.542, 905.216, 106.496, 0, 1169.534,
0, 1195.852, 99.654, 0, 1236.396, 0, 0, 0, 0, 1190.473, 183.15,
0, 1143.686, 1132.754, 1132.025, 1123.856, 1103.6, 1100.9, 1083.716,
1079.191, 1078.588, 1083.785, 1070.395, 1069.503, 1066.885, 1065.121,
1064.885, 1064.401, 1065.285, 1065.403, 1075.009, 1076.036, 1076.105,
1086.308, 1075.821, 1074.423, 1073.282, 126.268, 0, 1092.281,
0, 1019.684, 1087.663, 286.389, 33.693, 0, 1117.064, 0, 1114.254,
171.424, 0, 1111.518, 0, 1110.347, 92.529, 0, 1077.293, 0, 1107.754,
170.424, 0, 1107.136, 0, 1076.571, 165.626, 0, 1067.475, 0, 1110.71,
92.559, 0, 274.852, 0, 280.654, 43.177, 0, 280.278, 271.773,
270.639, 0, 1060.434, 88.369, 0, 1054.964, 1047.876, 1046.931,
1052.977, 1064.93, 1066.524, 1061.181, 1069.339, 1069.883, 275.888,
970.662, 1063.299, 268.308, 974.232, 1068.355, 1044.757, 1065.286,
1066.655, 1076.68, 1085.54, 1086.721, 1088.216, 1083.409, 1082.768,
1087.272, 1087.283, 1087.283, 1086.689, 1087.19, 1087.257, 1089.988,
1087.837, 1087.55, 272.953, 1040.007, 1091.144, 1076.712, 1089.387,
1091.077, 1103.104, -360.56, -555.715, 1072.081, 1073.131, 1073.201,
1073.04, 1089.257, 1091.419, 1072.888, 1073.111, 1073.141, 1104.182,
1075.399, 1073.48, 1103.711, 1076.133, 1072.456, 1104.247, 1105.181,
1105.305, 1072.744, 1104.02, 1106.105, 1074.673, 1102.759, 1106.503,
1076.082, 1104.401, 1108.177, 1078.025, 1109.772, 1111.888, 288.949,
292.278, 292.722, 1112.981, 1123.139, 1124.493, 1127.996, 1134.943,
1135.406, 1136.071, 1141.749, 1142.506, 1148.754, 1150.764, 1151.032,
1156.093, 1154.597, 1154.497, 1150.481, 1124.218, 1120.716, 1108.646,
1095.181, 1093.385, 1113.175, 1089.783, 1088.224, 1719.795, -286.164,
-553.625, 1112.49, 1099.064, 1097.274, 1106.601, 1105.705, 1105.645,
1100.75, 1110.143, 1111.395, 1114.044, 1108.662, 1107.944, 1115.615,
1114.492, 1114.417, 296.606, 1011.559, 1106.886, 213.15, 735.037,
804.622, 643.816, 590.868, 587.338, 588.443, 591.022, 591.366,
595.437, 598.575, 598.994, -70.23, 562.977, 605.191, 608.988,
14.755, -64.476, -86.326, 490.189, 567.057, 573.015, 572.841,
572.83, 576.607, 578.243, 578.461, 579.714, 1343.595, 1445.446,
585.277, 588.153, 588.345, 588.809, 588.843, 588.848, 592.898,
591.447, 591.254, 592.538, 589.623, 589.429, 590.233, 587.901,
587.59, 589.276, 603.526, 605.426, 691.889, 810.649, 818.566,
889.948, 866.466, 863.335, 164.367, 164.895, 164.966, 158.056,
851.73, 897.975, 164.51, 806.786, 892.423, 894.069, 892.465,
892.251, 153.408, 842.909, 888.876, 151.935, 802.889, 889.683,
154.817, 146.446, 145.329, 888.428, 193.529, 147.202, 885.088,
224.817, 136.78, 147.255, 802.581, 889.958, 888.64, 889.975,
890.064, 1575.971, 968.922, 887.982, 889.232, 860.991, 857.226,
872.007, 893.987, 895.452, 859.373, 892.292, 896.681, 859.84,
859.942, 859.955, 898.274, 860.41, 857.885, 879.17, 862.76, 860.572,
856.434, 854.683, 854.45, 851.949, 174.027, 128.832, 848.35,
847.791, 847.716, 847.847, 849.008, 849.163, 133.198, 806.744,
851.647, 146.291, 773.622, 857.267, 860.622, 834.937, 831.512,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 623.517, 0, 631.712, 97.187,
0, 631.526, 0, 641.912, 98.756, 0, 648.161, 0, 654.673, 54.556,
0, 659.733, 0, 603.775, 684.278, 696.548, 689.181, 688.198, 687.25,
682.216, 681.881, 711.089, 713.482, 713.802, 713.969, 715.124,
715.278, 711.369, 709.299, 709.162, 679.428, 675.016, 674.428,
675.948, 673.724, 673.427, 675.195, 678.8, 679.041, 681.486,
686.341, 686.988, 696.445, 694.867, 694.657, 703.31, 705.003,
705.116, 0, 711.882, 109.52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1269.549, 1438.822, 0, 720.784, 60.065, 0, 721.678, 0, 713.535,
109.775, 0, 709.244, 0, 681.534, 104.851, 0, 693.092, 0, 681.394,
56.783, 0, 679.538, 0, 680.866, 104.749, 0, 675.661, 0, 668.345,
102.822, 0, 0, 0, 0, 0, -69.364, -10.671, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 677.983, 0), Y = c(0, -560.914,
-86.294, 0, 12.57, 0, 10.336, 0.861, 0, -569.851, 0, -582.641,
-89.637, 0, -513.461, 0, -582.556, -89.624, 0, -567.846, 0, -574.869,
-47.906, 0, -567.25, 0, -571.902, -87.985, 0, -554.095, 0, -574.651,
-88.408, 0, -557.127, 0, -365.662, -30.472, 0, -565.123, 0, -564.378,
-86.827, 0, -380.282, 0, -587.338, -90.36, 0, -373.494, 0, -394.186,
-32.849, 0, -15.305, 0, -586.833, -90.282, 0, -391.571, 0, -603.611,
-92.863, 0, -392.659, 0, -595.738, -49.645, 0, -369.805, 0, -585.56,
-90.086, 0, -607.653, 0, -598.77, -92.118, 0, -603.821, 0, -362.726,
-30.227, 0, -590.569, 0, -365.806, -56.278, 0, -584.274, 0, -594.203,
-91.416, 0, -378.246, 0, -385.426, -32.119, 0, -585.678, 0, -569.332,
-87.589, 0, -572.327, 0, -566.971, -87.226, 0, -586.06, 0, -584.482,
-48.707, 0, -593.944, 0, -583.298, -89.738, 0, -580.776, 0, -590.48,
-90.843, 0, -600.616, 0, -598.535, -49.878, 0, -602.826, 0, -29.44,
-4.529, 0, -618.062, 0, -592.995, -91.23, 0, -609.041, 0, -411.878,
-34.323, 0, -413.262, 0, -52.608, -8.094, 0, -411.569, 0, -415.928,
-63.989, 0, -48.63, 0, -47.12, -3.927, 0, -415.299, 0, -599.916,
-92.295, 0, -44.314, 0, -411.139, -63.252, 0, -46.124, 0, -48.509,
-4.042, 0, -407.485, 0, -413.499, -63.615, 0, -55.391, 0, -415.922,
-63.988, 0, -57.45, 0, -422.827, -35.236, 0, -54.607, 0, -414.86,
-63.825, 0, -51.45, 0, -48.986, -7.536, 0, -45.942, 0, -413.839,
-34.487, 0, -411.075, 0, -38.956, -5.993, 0, -14.525, 0, 35.454,
5.455, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 279.172, 0, 315.145,
48.484, 0, 313.523, 0, 314.854, 48.439, 0, 314.166, 0, -89.369,
-7.447, 0, -125.57, 0, -138.883, -157.401, -162.487, -158.14,
-157.561, -171.662, -160.906, -160.189, -166.883, -171.154, -171.724,
-187.379, -175.901, -174.37, -188.734, -185.215, -184.98, -189.818,
-189.088, -188.991, 283.22, 302.83, 305.444, 270.066, 289.663,
290.969, 275.119, 276.953, 277.198, 260.347, 261.677, 261.855,
246.632, 250.916, 251.202, 238.808, 246.008, 246.968, 240.285,
253.356, 255.099, 251.122, 262.086, 262.817, 258.677, 268.433,
269.734, 264.812, 273.771, 274.966, 268.696, 276.112, 276.606,
268.26, 275.135, 276.051, 265.545, 272.265, 273.161, 264.077,
268.116, 268.385, 261.015, 266.217, 266.911, 259.724, 265.892,
266.714, 259.602, 265.93, 266.352, 258.209, 265.537, 266.514,
259.411, 265.687, 266.523, 259.761, 266.978, 267.459, 260.97,
268.01, 268.949, 262.4, 271.867, 273.13, 264.803, 273.569, 274.153,
266.979, 275.25, 276.353, 268.073, 274.926, 275.84, 269.625,
279.424, 280.077, 271.785, 281.741, 283.068, 276.083, 284.162,
285.239, 279.865, 291.741, 292.533, 291.52, 302.124, 303.538,
300.685, 313.141, 314.802, 313.594, 327.048, 327.945, 325.593,
339.796, 341.69, 341.58, 355.702, 357.585, 356.602, 381.738,
383.413, 370.869, 394.752, 397.936, 391.659, 409.376, 411.739,
415.584, 433.281, 434.461, 437.478, 454.368, 456.62, 445.173,
427.865, 425.558, 383.004, 370.373, 369.531, 382.343, 399.016,
401.239, 413.831, 422.157, 423.267, 432.015, 448.845, 449.967,
0, 425.956, 482.75, 487.1, 501.178, 503.055, 0, 508.065, 42.339,
0, 492.242, 494.125, 494.376, 484.667, 484.368, 484.328, 467.742,
464.267, 464.035, 468.697, 433.854, 429.208, 400.85, 393.727,
392.777, 376.735, 23.546, 0, 0, 0, 0, 0, 0, 0, 885.955, 0, 0,
0, 0, 0, 0, 0, 0, 0, 500.323, 0, 481.941, 40.162, 0, 0, 0, 0,
530.014, 516.432, 514.621, 494.618, 478.341, 477.256, 456.114,
458.922, 459.297, 456.514, 425.5, 421.365, 462.707, 472.408,
473.055, 467.768, 454.51, 452.742, 441.741, 201.307, 169.249,
167.631, 163.795, 163.539, 160.509, 161.775, 161.943, 161.298,
397.997, 429.557, 419.546, 177.979, 161.875, 418.022, 423.34,
424.049, 413.55, 419.645, 420.457, 407.472, 415.496, 416.031,
407.006, 148.509, 114.042, 409.442, 164.128, 131.419, 109.849,
407.293, 427.122, 418.844, 426.465, 427.481, 417.779, 161.317,
127.123, 409.12, 367.918, 365.171, 395.673, 396.298, 396.381,
388.306, 392.859, 393.466, 473.545, 400.213, 395.325, 366.771,
370.934, 371.489, 339.469, 387.501, 393.905, 386.022, 390.883,
391.207, 378.721, 394.352, 396.436, 384.581, 45.245, 0, 380.042,
0, 375.485, 31.29, 0, 335.187, 0, 0, 0, 0, 323.048, 49.7, 0,
359.169, 361.068, 361.195, 348.111, 362.176, 364.051, 383.932,
392.518, 393.663, 349.699, 392.668, 395.532, 391.36, 396.741,
397.458, 387.906, 391.383, 391.846, 371.849, 378.048, 378.461,
318.908, 357.694, 362.865, 342.973, 40.35, 0, 288.149, 0, 259.291,
276.577, -102.466, -12.055, 0, 241.977, 0, 234.288, 36.044, 0,
247.849, 0, 244.967, 20.414, 0, 290.17, 0, 258.996, 39.846, 0,
269.854, 0, 259.002, 39.847, 0, 297.18, 0, 266.569, 22.214, 0,
-87.029, 0, -89.617, -13.787, 0, -95.348, -79.043, -76.869, 0,
246.968, 20.581, 0, 272.257, 289.812, 292.152, 271.933, 287.652,
289.748, 276.443, 284.481, 285.017, -116.323, 230.711, 276.983,
-141.084, 229.411, 278.81, 314.961, 280.304, 277.994, 277.686,
282.62, 283.278, 287.306, 282.107, 281.414, 290.288, 291.936,
292.045, 288.296, 302.772, 304.702, 292.15, 302.242, 303.588,
-136.669, 270.271, 297.4, 274.154, 301.517, 305.165, 290.369,
-426.184, -521.724, 321.366, 329.046, 329.558, 321.7, 306.849,
304.869, 321.009, 327.981, 328.91, 290.339, 328.378, 330.914,
289.022, 325.172, 329.992, 288.447, 295.19, 296.089, 322.57,
298.091, 296.459, 321.394, 300.861, 298.123, 320.881, 300.679,
297.985, 319.966, 296.464, 294.897, -164.893, -160.079, -159.437,
278.924, 281.357, 281.682, 269.259, 274.752, 275.118, 267.165,
265.143, 264.873, 258.509, 264.614, 265.429, 256.744, 266.259,
266.893, 263.361, 301.473, 306.555, 262.416, 313.245, 320.022,
281.712, 320.535, 323.123, -194.846, -521.034, -564.526, 274.367,
278.267, 278.787, 264.133, 268.555, 268.85, 256.818, 266.194,
267.444, 260.084, 265.714, 266.464, 262.669, 272.063, 272.689,
-175.62, 225.568, 279.06, -159.431, 216.327, 266.428, 245.597,
254.043, 254.606, 244.851, 255.798, 257.258, 248.152, 256.508,
257.622, -319.948, 219.787, 255.769, 243.947, -235.689, -299.64,
-334.288, 164.071, 230.518, 224.188, 233.409, 234.024, 228.544,
236.363, 237.405, 235.462, -116.121, -162.999, 240.889, 251.796,
252.524, 246.8, 258.72, 260.309, 257.726, 267.828, 269.175, 265.087,
277.995, 278.856, 273.876, 287.09, 288.852, 280.858, 292.608,
294.174, 246.387, 213.675, 211.494, 201.469, 209.375, 210.429,
-217.38, -218.456, -218.6, -232.09, 196.612, 225.193, -248.699,
166.3, 221.633, 213.54, 219.095, 219.836, -261.438, 192.469,
222.729, -265.48, 164.234, 221.529, -254.99, -254.199, -254.094,
209.197, -218.172, -246.663, 207.52, -196.823, -250.735, -261.477,
163.4, 220.05, 210.838, 222.305, 223.069, -196.463, 176.753,
226.515, 217.024, 249.012, 253.277, 203.798, 226.753, 228.283,
241.225, 228.683, 227.011, 235.665, 243.606, 244.664, 212.621,
240.812, 242.692, 207.285, 207.473, 207.498, 194.745, 202.346,
203.359, 191.408, -221.759, -249.304, 186.022, 192.304, 193.141,
182.101, 191.751, 193.037, -260.225, 162.901, 191.109, -252.884,
141.317, 193.877, 186.754, 220.72, 225.249, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 304.742, 0, 316.744, 48.73, 0, 334.833, 0,
352.419, 54.218, 0, 368.749, 0, 378.178, 31.515, 0, 378.548,
0, 339.912, 385.233, 406.951, 373.432, 368.963, 411.64, 362.786,
359.529, 358.897, 353.738, 353.051, 336.054, 340.344, 340.915,
335.944, 355.461, 356.762, 345.061, 372.321, 375.955, 376.513,
404.13, 407.812, 416.338, 426.28, 426.943, 432.67, 429.483, 429.058,
426.368, 425.226, 425.074, 416.298, 411.677, 411.369, 0, 345.327,
53.127, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 183.69, 208.182, 0,
261.561, 21.797, 0, 298.1, 0, 281.941, 43.375, 0, 314.226, 0,
304.413, 46.833, 0, 374.171, 0, 368.525, 30.71, 0, 359.118, 0,
367.905, 56.601, 0, 287.494, 0, 285.213, 43.879, 0, 0, 0, 0,
0, 735.244, 113.115, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 304.619, 0)), row.names = 31537:32536, class = "data.frame")
This should give you a way to get the image you want. However, I don't have your original TIFF, and therefore the alignment won't be correct here since I had to use a cut-and-pasted version of the png in your question.
Anyway, the method I would use is:
Convert the image to a raster
Convert the raster to a grid::rasterGrob
Plot the rasterGrob as your first layer in ggplot using annotation_custom
Plot your other layers as normal.
Here's an example:
library(ggplot2)
library(rtiff)
library(grid)

# Load the photograph.
x <- readTiff('F01_screenshot.tiff')

# Build an array of dimension c(x@size, 3) from the red, green and blue
# channels and convert it to a raster. The channels and the size are S4 slots
# of the object, so they must be accessed with `@` (a `#` would start a comment
# and cut the statement short).
pic <- as.raster(array(c(x@red, x@green, x@blue), c(x@size, 3)))

# Wrap the raster in a grob so that it can be added to a ggplot.
picgrob <- rasterGrob(pic)

# The picture is added first so the points and density contours are drawn
# on top of it.
ggplot(eet2, aes(x = X, y = Y)) +
  annotation_custom(picgrob) +
  geom_point() +
  stat_density2d() +
  coord_equal()
You may need to scale your Y axis data to make it match the aspect ratio of the picture.
As an example, if we assume max(eet2$Y) is the top edge of the image, and min(eet2$Y) the bottom edge, and also assume that min(eet2$X) is the left edge and max(eet2$X) the right edge (as you have suggested is the case in your comments), we can marry the picture to the data like this:
# Width-to-height ratio of the picture (second raster dimension over the first).
pic_ratio <- ncol(pic) / nrow(pic)
# Ratio of the X extent to the Y extent of the data.
data_ratio <- diff(range(eet2$X)) / diff(range(eet2$Y))
# Rescale Y so that the data's X/Y extent ratio equals the picture's ratio.
eet2$Y <- eet2$Y * data_ratio / pic_ratio

# Clamp the panel to the data range so the picture and the data share the
# same extent.
ggplot(data = eet2, mapping = aes(x = X, y = Y)) +
  annotation_custom(picgrob) +
  geom_point() +
  stat_density2d() +
  coord_equal(xlim = range(eet2$X), ylim = range(eet2$Y))
If this alignment is not correct, then we need extra calibration information not present in the data (i.e. what values of eet2$Y should represent the top and bottom of the image, and what values of eet2$X represent the left and right edges).
The data, AllBooks has 590 observations of 8266 variables. Here is the code I have:
# Load the document-term matrix and convert it to a plain matrix.
AllBooks <- read_csv("AllBooks_baseline_DTM_Unlabelled.csv")
dtms <- as.matrix(AllBooks)
# Per-row total divided by 8266 (the number of variables), as a one-column matrix.
dtms_freq <- as.matrix(rowSums(dtms) / 8266)
# The same values sorted in increasing order.
dtms_freq1 <- dtms_freq[order(dtms_freq), ]
# Spread and centre of the per-row values.
sd <- sd(dtms_freq)
mean <- mean(dtms_freq)
This tells me that my mean is: 0.01242767
and my std. dev. is: 0.01305608
So since my standard deviation is low this means the data has low variability in terms of size of documents. So I do not need to normalize the DTM? And by normalize I mean using the scale function in R which subtracts the mean of the data and divides by the standard deviation.
In other words, my big question is: When am I supposed to standardize data (specifically a Document Term Matrix) for clustering purposes?
Here is a little output of data:
dput(head(AllBooks,10))
budding = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), enjoyer = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), needs = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), sittest = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), eclipsed = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), engagement = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
exuberant = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), abandons = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), well = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), cheerfulness = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
hatest = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), state = c(0, 0,
0, 0, 0, 0, 0, 0, 0, 0), stained = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), production = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), whitened = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), revered = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), developed = c(0, 0, 0, 2, 0, 0, 0, 0, 0, 0),
regarded = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), enactments = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), aromatical = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), admireth = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0
), foothold = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), shots = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), turner = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), inversion = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
lifeless = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), postponement = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), stout = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), taketh = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), kettle = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), erred = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), thinkest = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), modern = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), reigned = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0), sparingly = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
visual = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), thoughts = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0), illumines = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0), attire = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
explains = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
You can view full data from link: https://www.dropbox.com/s/p9v1y6oxith1prh/AllBooks_baseline_DTM_Unlabelled.csv?dl=0
You have a sparse dataset that is dominated by zeros, hence the standard deviation is very low. You can scale it if some of your non-zero counts are extremely large, e.g. some are in the 100s while others are 1s and 2s.
It might not be such a good idea to use kmeans on sparse data, because it is unlikely you can find meaningful centers. There might be a few options available, check this link on dimension reduction. There are also graph-based approaches, such as this used in biology.
Below is a simplistic way to cluster and visualize:
x <- read.csv("AllBooks_baseline_DTM_Unlabelled.csv")
# Keep rows whose mean is positive, and drop singleton columns
# (columns with at most one positive entry).
x <- x[rowMeans(x) > 0, colSums(x > 0) > 1]
Treat it as binary and run hierarchical clustering on a binary distance:
# Ward clustering on the binary (presence/absence) distance between rows.
hc <- hclust(dist(x, method = "binary"), method = "ward.D")
# Cut the tree into five clusters.
clus <- cutree(hc, k = 5)
Calculate PCA, run t-SNE on the first 30 principal components, and visualize:
library(Rtsne)
library(ggplot2)

# PCA on centred, unit-variance columns (`scale.` is the full argument name).
pca <- prcomp(x, scale. = TRUE, center = TRUE)
# Embed the first 30 principal components in two dimensions with t-SNE.
TS <- Rtsne(pca$x[, 1:30])
# Plot the embedding, coloured by the hierarchical cluster assignment.
ggplot(
  data.frame(Dim1 = TS$Y[, 1], Dim2 = TS$Y[, 2], C = factor(clus)),
  aes(x = Dim1, y = Dim2, col = C)
) +
  geom_point()
Cluster 5 seems to be very different, and they differ in these words:
# The ten columns whose mean in cluster 5 most exceeds their mean in the
# other clusters, in increasing order of that difference.
names(tail(
  sort(colMeans(x[clus == 5, ]) - colMeans(x[clus != 5, ])),
  n = 10
))
[1] "wisdom" "thee" "lord" "things" "god" "hath" "thou" "man"
[9] "thy" "shall"
I would like to find the smallest distance between the profiles stored in a data frame. I am interested especially in one row in comparison to the rest of the rows stored in the data frame.
That's a data frame:
structure(list(`10` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `34` = c(0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 393090, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6718400,
0, 311350, 0), `59` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2164949.7,
4834137.6, 0, 0, 0, 1187816.7, 0, 0, 0, 0, 0, 0, 1340912.5, 0
), `84` = c(0, 0, 0, 0, 0, 0, 0, 0, 8607100, 0, 0, 17586713.2,
22629743.6, 0, 0, 0, 2808791.7, 0, 0, 4026222.5, 0, 0, 0, 1981900,
0), `110` = c(2296000, 0, 0, 0, 0, 2140221.7, 0, 0, 5809230.6,
0, 0, 37134898.5, 3861828.7, 2553100, 0, 12075845.8, 0, 0, 1272950,
8695273, 0, 0, 2657180, 2710080, 0), `134` = c(0, 0, 0, 1176150,
0, 1329596.7, 1471000, 0, 6511934, 6511934, 0, 18709227.3, 0,
1041211.2, 0, 6544176.9, 0, 0, 2412651.7, 7724956.9, 2878418.3,
0, 8620131.7, 2386972.8, 0), `165` = c(0, 1226610, 0, 1345098.7,
2083771.9, 0, 1808231.4, 0, 0, 10742997.7, 0, 13060798.9, 0,
538340, 538340, 2791649.5, 0, 0, 6217622, 1316097.1, 4716931.8,
0, 6615816.9, 1510532, 0), `199` = c(0, 1571525, 0, 1903038.3,
1676700, 0, 888832.2, 0, 0, 9084418.6, 0, 11189460.1, 0, 0, 1807662.5,
2564275, 0, 0, 18080359.7, 0, 0, 0, 2397710.2, 1717949.2, 0),
`234` = c(0, 1314900, 2482696, 1325684, 0, 0, 0, 0, 0, 7321432.7,
0, 9843409.2, 0, 0, 1073341.7, 2762775, 0, 0, 9335312.8,
0, 0, 0, 1950788.2, 1509100, 0), `257` = c(0, 1568700, 14604298.7,
940162.2, 0, 0, 0, 0, 0, 4779505.9, 0, 9691692.4, 0, 0, 735290,
2650165, 0, 2311383.7, 5193383.4, 0, 0, 0, 1341998.7, 1225325.6,
0), `362` = c(0, 0, 4190740.5, 288800, 0, 0, 0, 0, 0, 4846634.8,
0, 9574498.7, 0, 0, 0, 1425600, 0, 8339312.1, 3877892.5,
0, 0, 0, 1752866.7, 0, 0), `433` = c(0, 0, 773280, 0, 0,
0, 0, 0, 0, 3926582.8, 3926582.8, 5962586.5, 0, 0, 0, 1041400,
0, 1972909.3, 1895439.4, 0, 0, 0, 963891.2, 0, 1109800),
`506` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9332272, 0, 0, 0,
0, 0, 0, 2219100, 0, 0, 0, 0, 0, 0, 0), `581` = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 4371537.1, 0, 0, 0, 0, 0, 0, 2428800,
0, 0, 0, 0, 0, 0, 0), `652` = c(0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 1689871.4, 0, 0, 0, 0, 0, 0, 988399.7, 0, 0, 0, 0, 0,
0, 0), `733` = c(0, 0, 0, 0, 0, 0, 0, 1250100, 0, 0, 1754205.3,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `818` = c(0, 0,
0, 0, 0, 0, 0, 517340, 0, 0, 1149227.6, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0), `896` = c(0, 0, 0, 0, 0, 0, 0, 579846.7,
0, 0, 985931.2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
`972` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 858255.5, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `1039` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 848993.3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), .Names = c("10", "34", "59", "84", "110", "134",
"165", "199", "234", "257", "362", "433", "506", "581", "652",
"733", "818", "896", "972", "1039"), row.names = c("Mark_1",
"Mark_2", "Alex_1", "Katrin_1", "Georg_1", "Martin_1",
"Tim_1", "Tom_1", "Mike_1", "Mike_2", "Mike_3",
"Hare_1", "Dea_1", "Monty_1", "Monty_2", "Niko_1",
"Lee_1", "Marq_1", "Otto_1", "Priaq_1", "Surkta_1",
"Norsa_1", "Norsa_2", "Quer_1", "Quer_2"), class = "data.frame")
So the row named Katrin_1 is the one that interests me. I would like to find which rows have the smallest Euclidean distance to Katrin_1, let's say the 3-5 closest rows.
Let's drop the Katrin_1 row with df[!rownames(df) %in% "Katrin_1", ], subtract df["Katrin_1", ] from each of the remaining rows with sweep, compute squared Euclidean distances by squaring the resulting matrix element-wise and summing with rowSums (the square root is not needed to find the minimum), and use which.min to get the final result:
# Row-wise squared Euclidean distance to Katrin_1 over all other rows;
# which.min() picks the closest row and names() returns its row name.
names(
  which.min(
    rowSums(
      sweep(
        df[!rownames(df) %in% "Katrin_1", ],
        2,
        as.numeric(df["Katrin_1", ]),
        `-`
      )^2
    )
  )
)
# [1] "Mark_2"
This should be much more efficient than using dist, as dist would compute all pairwise distances, while we need only a few.