Error: NAs introduced by coercion when converting character to numeric - r

I want to convert the column Annual.income, which is stored as character in my data frame, to numeric. The code I use produces NA values, although the resulting class is numeric.
I have looked for answers on forums, but none of the existing questions solves my problem:
there are no NAs in the Annual.income column, only numbers, and all the data is formatted to use "." rather than "," as the decimal separator.
Here is the code I use.
data$Annual.income <- as.numeric(as.character(data$Annual.income))
******************************UPDATE********************************************
Here is the dput of the column Annual.income.
dput(data$Annual.income)
c("34 500", "51 400", "43 200", "40 100", "36 400", "39 100",
"41 900", "48 700", "45 500", "45 500", "49 100", "35 100", "34 500",
"29 200", "32 200", "36 300", "35 800", "31 500", "33 000", "34 600",
"32 100", "32 000", "31 400", "33 200", "42 600", "29 200", "34 600",
"29 200", "34 100", "30 600", "34 034", "33 600", "31 000", "35 500",
"30 600", "30 600", "30 600", "30 800", "34 034", "33 200", "32 900"
)
The following still gives me NAs.
# Dropping as.character() does not help: the values still contain spaces
# (e.g. "34 500"), so as.numeric() cannot parse them and returns NA.
data$Annual.income <- as.numeric(data$Annual.income)
I imported the data using the Import Dataset command in the Environment pane, with stringsAsFactors unchecked, Heading = Yes, Separator = Semicolon and Decimal = Period.
Thanks
...

The whitespace is what causes the problem here: as.numeric() cannot parse a string such as "34 500". Simply remove all whitespace characters with gsub() before converting, e.g.
# Incomes as imported: a space is used as the thousands separator, which is
# why as.numeric() cannot parse these strings directly.
Annual.income <- c(
  "34 500", "51 400", "43 200", "40 100", "36 400", "39 100", "41 900",
  "48 700", "45 500", "45 500", "49 100", "35 100", "34 500", "29 200",
  "32 200", "36 300", "35 800", "31 500", "33 000", "34 600", "32 100",
  "32 000", "31 400", "33 200", "42 600", "29 200", "34 600", "29 200",
  "34 100", "30 600", "34 034", "33 600", "31 000", "35 500", "30 600",
  "30 600", "30 600", "30 800", "34 034", "33 200", "32 900"
)

# Strip every whitespace character first, then convert what is left.
income_digits <- gsub("\\s", "", Annual.income)
as.numeric(income_digits)
#> [1] 34500 51400 43200 40100 36400 39100 41900 48700 45500 45500 49100
#> [12] 35100 34500 29200 32200 36300 35800 31500 33000 34600 32100 32000
#> [23] 31400 33200 42600 29200 34600 29200 34100 30600 34034 33600 31000
#> [34] 35500 30600 30600 30600 30800 34034 33200 32900
Created on 2019-05-17 by the reprex package (v0.2.1)

Related

Pivot table with weekly data

I have a data set that contains weekly sales data from 4 stores (Store_1, Store_2, Store_3 and Store_4) for the period from 2017 to 2019.
FINAL_SALES<-structure(list(year_week = structure(154:322, .Label = c("2014 01",
"2014 06", "2014 07", "2014 08", "2014 09", "2014 10", "2014 11",
"2014 12", "2014 13", "2014 14", "2014 15", "2014 16", "2014 17",
"2014 18", "2014 19", "2014 20", "2014 21", "2014 22", "2014 23",
"2014 24", "2014 25", "2014 26", "2014 27", "2014 28", "2014 29",
"2014 30", "2014 31", "2014 32", "2014 33", "2014 34", "2014 35",
"2014 36", "2014 37", "2014 38", "2014 39", "2014 40", "2014 41",
"2014 42", "2014 43", "2014 44", "2014 45", "2014 46", "2014 47",
"2014 48", "2014 49", "2014 50", "2014 51", "2014 52", "2015 01",
"2015 02", "2015 03", "2015 04", "2015 05", "2015 06", "2015 07",
"2015 08", "2015 09", "2015 10", "2015 11", "2015 12", "2015 13",
"2015 14", "2015 15", "2015 16", "2015 17", "2015 18", "2015 19",
"2015 20", "2015 21", "2015 22", "2015 23", "2015 24", "2015 25",
"2015 26", "2015 27", "2015 28", "2015 29", "2015 30", "2015 31",
"2015 32", "2015 33", "2015 34", "2015 35", "2015 36", "2015 37",
"2015 38", "2015 39", "2015 40", "2015 41", "2015 42", "2015 43",
"2015 44", "2015 45", "2015 46", "2015 47", "2015 48", "2015 49",
"2015 50", "2015 51", "2015 52", "2015 53", "2016 01", "2016 02",
"2016 03", "2016 04", "2016 05", "2016 06", "2016 07", "2016 08",
"2016 09", "2016 10", "2016 11", "2016 12", "2016 13", "2016 14",
"2016 15", "2016 16", "2016 17", "2016 18", "2016 19", "2016 20",
"2016 21", "2016 22", "2016 23", "2016 24", "2016 25", "2016 26",
"2016 27", "2016 28", "2016 29", "2016 30", "2016 31", "2016 32",
"2016 33", "2016 34", "2016 35", "2016 36", "2016 37", "2016 38",
"2016 39", "2016 40", "2016 41", "2016 42", "2016 43", "2016 44",
"2016 45", "2016 46", "2016 47", "2016 48", "2016 49", "2016 50",
"2016 51", "2016 52", "2017 01", "2017 02", "2017 03", "2017 04",
"2017 05", "2017 06", "2017 07", "2017 08", "2017 09", "2017 10",
"2017 11", "2017 12", "2017 13", "2017 14", "2017 15", "2017 16",
"2017 17", "2017 18", "2017 19", "2017 20", "2017 21", "2017 22",
"2017 23", "2017 24", "2017 25", "2017 26", "2017 27", "2017 28",
"2017 29", "2017 30", "2017 31", "2017 32", "2017 33", "2017 34",
"2017 35", "2017 36", "2017 37", "2017 38", "2017 39", "2017 40",
"2017 41", "2017 42", "2017 43", "2017 44", "2017 45", "2017 46",
"2017 47", "2017 48", "2017 49", "2017 50", "2017 51", "2017 52",
"2018 01", "2018 02", "2018 03", "2018 04", "2018 05", "2018 06",
"2018 07", "2018 08", "2018 09", "2018 10", "2018 11", "2018 12",
"2018 13", "2018 14", "2018 15", "2018 16", "2018 17", "2018 18",
"2018 19", "2018 20", "2018 21", "2018 22", "2018 23", "2018 24",
"2018 25", "2018 26", "2018 27", "2018 28", "2018 29", "2018 30",
"2018 31", "2018 32", "2018 33", "2018 34", "2018 35", "2018 36",
"2018 37", "2018 38", "2018 39", "2018 40", "2018 41", "2018 42",
"2018 43", "2018 44", "2018 45", "2018 46", "2018 47", "2018 48",
"2018 49", "2018 50", "2018 51", "2018 52", "2019 01", "2019 02",
"2019 03", "2019 04", "2019 05", "2019 06", "2019 07", "2019 08",
"2019 09", "2019 10", "2019 11", "2019 12", "2019 13", "2019 14",
"2019 15", "2019 16", "2019 17", "2019 18", "2019 19", "2019 20",
"2019 21", "2019 22", "2019 23", "2019 24", "2019 25", "2019 26",
"2019 27", "2019 28", "2019 29", "2019 30", "2019 31", "2019 32",
"2019 33", "2019 34", "2019 35", "2019 36", "2019 37", "2019 38",
"2019 39", "2019 40", "2019 41", "2019 42", "2019 43", "2019 44",
"2019 45", "2019 46", "2019 47", "2019 48", "2019 49", "2019 50",
"2019 51", "2019 52", "2020 01", "2020 02", "2020 03", "2020 04",
"2020 05", "2020 06", "2020 07", "2020 08", "2020 09", "2020 10",
"2020 11", "2020 12", "2020 13"), class = "factor"), Year = c(2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017,
2017, 2017, 2017, 2017, 2017, 2017, 2017, 2018, 2018, 2018, 2018,
2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,
2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,
2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,
2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018, 2018,
2018, 2018, 2018, 2018, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019,
2019, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020,
2020, 2020, 2020), Store_1 = c(1371.61, 2398.37, 2107.29, 1752.56,
2394.57, 2309.07, 3266.67, 1993.1, 3021.57, 2922.96, 3357.87,
2317.24, 2563.67, 3883.79, 2688.5, 2583.24, 2486.72, 2521.87,
3646.1, 3271.04, 2568.42, 1565.41, 3412.02, 3051.4, 2440.17,
2394.38, 3193.14, 3375.54, 2527.95, 2873.94, 3788.03, 2772.86,
3642.49, 2346.12, 3266.1, 2070.62, 3788.6, 2456.32, 2664.18,
4027.62, 3024.04, 3128.92, 2118.12, 3440.9, 2670.26, 3509.11,
2866.53, 2894.65, 2650.5, 3331.08, 2845.44, 2743.79, 3379.72,
2291.97, 2707.31, 2322.75, 2895.41, 3134.62, 3487.83, 2406.54,
2601.1, 3402.14, 3654.46, 2472.66, 3096.43, 3274.65, 2832.33,
3587.96, 2749.68, 2826.06, 2994.97, 3716.02, 1818.68, 3152.86,
3221.07, 3425.32, 3067.17, 3768.08, 2812.19, 3779.1, 2696.86,
3011.5, 3730.84, 3452.3, 4110.65, 2654.11, 2894.65, 3929.58,
3852.82, 2842.59, 2874.13, 3369.27, 3537.23, 3616.46, 2374.62,
4057.45, 2607.94, 4312.81, 3009.41, 2804.21, 3322.91, 3634.89,
3336.97, 2886.86, 2931.13, 1909.5, 3503.03, 2006.97, 2485.01,
2910.42, 4531.88, 2551.89, 1662.5, 2990.98, 3540.84, 2938.16,
2657.91, 4274.24, 3140.32, 3744.33, 2806.49, 2524.53, 3311.51,
4121.48, 2350.3, 2866.34, 4018.5, 3291.75, 3649.52, 3109.54,
3500.94, 3542.17, 3612.66, 2899.4, 4104.57, 3526.78, 4146.75,
2300.33, 2562.91, 4213.63, 4100.01, 3145.83, 2939.87, 4007.29,
2846.58, 4704.59, 2711.11, 3873.15, 3187.82, 4595.15, 3081.99,
3104.6, 3413.54, 4192.35, 3727.99, 3541.22, 1597.71, 1307.39,
3863.65, 3120.75, 2696.29, 3094.53, 4412.56, 2998.01, 3245.77,
2754.05, 3197.13, 2867.48, 794.2), Store_2 = c(704.9, 1415.5,
1010.04, 1294.85, 1294.66, 1513.16, 1946.36, 1506.7, 1726.91,
1758.64, 2033.57, 1726.53, 1845.47, 1975.62, 1763.96, 1550.97,
1770.42, 1459.58, 1670.29, 1884.61, 1647.3, 1106.75, 1708.67,
1892.02, 1682.64, 1613.1, 1666.3, 1919.76, 1637.61, 1763.96,
1482.95, 1745.34, 1700.31, 1601.51, 1427.47, 1347.1, 2059.22,
1742.68, 1779.16, 1716.08, 1954.53, 1791.7, 1536.34, 1708.48,
1765.48, 1948.26, 1819.82, 1736.79, 1453.31, 2202.29, 2060.55,
2044.59, 1558.19, 1388.33, 1415.5, 1651.67, 1634.76, 1922.42,
1989.87, 1760.73, 1655.09, 1721.59, 1975.43, 1776.69, 2101.02,
1743.44, 1811.65, 1946.74, 1962.13, 1528.55, 1763.96, 2120.4,
1363.25, 1993.48, 1862.57, 1883.47, 2062.64, 2282.28, 1761.87,
2080.12, 1903.8, 2093.61, 1976.95, 1789.04, 1958.14, 1709.81,
1917.1, 1781.82, 2109.19, 1949.4, 1971.25, 2048.96, 1932.11,
2202.67, 1794.93, 2045.54, 1792.65, 2472.28, 1884.23, 2052.38,
1760.73, 2299.19, 2205.14, 2112.61, 1323.92, 1130.88, 1795.88,
1520.95, 1748, 1656.61, 2337.95, 1929.26, 1112.64, 1829.89, 2217.49,
2147.38, 2080.5, 2166.57, 2083.35, 2257.2, 2090.76, 1292.38,
1883.66, 2178.16, 1635.33, 2133.7, 1813.55, 1742.3, 2126.48,
2117.36, 1943.89, 2205.52, 2163.53, 2046.68, 1834.45, 2084.87,
1946.17, 1692.33, 1632.86, 2084.68, 1961.18, 2332.06, 2226.23,
2073.09, 1707.15, 2551.89, 2090, 2126.67, 2007.16, 2402.93, 2194.69,
2270.5, 2141.49, 2346.31, 2569.56, 2516.36, 648.66, 829.73, 2001.46,
1830.08, 1899.05, 1782.96, 2350.49, 2050.67, 2146.62, 1930.97,
2190.13, 1740.97, 481.08), Store_3 = c(118.37, 191.9, 115.71,
146.87, 181.45, 179.17, 190.38, 214.51, 215.65, 211.85, 216.41,
183.92, 212.99, 273.79, 195.51, 164.73, 182.59, 168.53, 182.02,
169.86, 165.68, 121.79, 179.36, 190.19, 183.16, 163.78, 224.01,
202.16, 163.21, 174.99, 175.56, 184.11, 189.24, 169.48, 167.96,
151.05, 200.83, 179.55, 209.95, 265.24, 201.78, 205.2, 207.29,
211.09, 170.62, 206.34, 184.49, 190.95, 174.61, 231.99, 207.1,
219.07, 246.24, 177.27, 174.04, 207.67, 193.8, 196.46, 218.88,
209.38, 213.94, 209.19, 238.83, 236.55, 234.46, 272.65, 221.73,
189.43, 195.51, 186.58, 192.47, 217.74, 139.46, 211.09, 210.33,
165.49, 184.68, 222.3, 244.72, 202.35, 191.71, 175.94, 155.42,
184.87, 175.94, 159.79, 179.17, 178.22, 193.99, 187.53, 223.44,
333.83, 205.01, 216.98, 180.88, 215.46, 195.7, 221.73, 201.78,
190.38, 209.38, 218.31, 212.8, 225.15, 231.23, 168.53, 205.77,
192.28, 207.86, 190.19, 238.45, 222.11, 119.51, 211.09, 240.73,
227.81, 216.6, 296.21, 210.71, 219.64, 187.72, 219.83, 180.69,
192.66, 182.4, 212.04, 205.01, 200.07, 245.86, 197.03, 259.54,
230.85, 212.99, 222.87, 200.64, 239.59, 178.22, 201.21, 186.58,
207.86, 214.32, 221.92, 196.27, 306.09, 197.03, 269.42, 205.01,
220.78, 228, 230.66, 220.78, 250.42, 230.28, 214.89, 265.62,
275.88, 61.75, 167.77, 219.45, 255.74, 240.16, 229.33, 251.75,
226.67, 276.64, 227.62, 232.18, 191.14, 54.34), Store_4 = c(548.53,
791.35, 981.35, 310.65, 918.46, 616.74, 1129.93, 271.32, 1079.39,
952.66, 1107.89, 406.41, 505.4, 1634.19, 729.22, 867.35, 533.71,
893.38, 1793.79, 1216.57, 755.44, 336.87, 1523.99, 969.19, 574.37,
617.88, 1303.21, 1253.62, 726.94, 935.37, 2129.9, 843.41, 1752.75,
574.94, 1670.48, 572.47, 1528.55, 534.09, 674.88, 2046.3, 867.73,
1132.21, 374.49, 1521.52, 734.16, 1354.51, 862.22, 967.1, 1022.58,
897.18, 577.79, 480.32, 1574.72, 726.37, 1117.58, 463.6, 1066.66,
1015.74, 1279.27, 436.24, 732.45, 1471.74, 1439.82, 459.8, 761.14,
1258.56, 798.76, 1451.79, 591.66, 1111.31, 1038.73, 1378.07,
315.97, 948.48, 1148.36, 1375.98, 819.85, 1263.5, 805.03, 1496.82,
601.35, 742.14, 1598.28, 1478.39, 1977.14, 784.13, 798.76, 1968.97,
1549.45, 706.04, 679.06, 986.67, 1399.92, 1197.19, 398.81, 1796.26,
619.21, 1618.99, 923.4, 561.45, 1353.18, 1117.39, 919.22, 548.91,
1375.79, 610.09, 1501.38, 293.36, 529.15, 1063.62, 1955.67, 400.52,
430.54, 950, 1082.24, 562.97, 361, 1811.65, 845.88, 1267.49,
528.39, 1012.32, 1246.97, 1750.28, 532.76, 520.41, 2000.32, 1349.38,
1277.18, 794.58, 1297.7, 1105.42, 1235.95, 630.23, 2069.48, 1202.32,
2022.36, 406.98, 743.47, 1921.28, 1924.32, 591.66, 516.61, 1627.92,
942.4, 1883.28, 416.1, 1525.7, 952.85, 1961.56, 666.9, 583.49,
1041.77, 1630.96, 893.19, 748.79, 887.49, 309.89, 1643.12, 1034.74,
557.27, 1082.62, 1810.51, 720.86, 822.51, 595.84, 775.01, 935.18,
258.78)), class = "data.frame", row.names = 154:322)
My intention is to convert this data into the layout shown in the example below (with a pivot or another function). The main purpose is to compare the data week by week across different years. Can anybody please help me with this problem?
If the order of the columns is not that important, the following works well:
library(tidyr)
# Split "YYYY WW" into separate year and week columns, then spread every
# Store_* column into one column per year (Store_1_2017, Store_1_2018, ...),
# leaving one row per week.
# The data frame is FINAL_SALES; a bare `df` would resolve to the stats::df()
# function and make separate() fail.
FINAL_SALES %>%
  separate(col = year_week, into = c("year", "week"), sep = " ") %>%
  pivot_wider(
    id_cols = week,
    names_from = "year",
    values_from = starts_with("Store")
  )
# A tibble: 52 x 17
week Store_1_2017 Store_1_2018 Store_1_2019 Store_1_2020 Store_2_2017 Store_2_2018 Store_2_2019
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 01 1372. 3380. 2931. 1598. 705. 1558. 1324.
2 02 2398. 2292. 1910. 1307. 1416. 1388. 1131.
3 03 2107. 2707. 3503. 3864. 1010. 1416. 1796.
4 04 1753. 2323. 2007. 3121. 1295. 1652. 1521.
5 05 2395. 2895. 2485. 2696. 1295. 1635. 1748
6 06 2309. 3135. 2910. 3095. 1513. 1922. 1657.
7 07 3267. 3488. 4532. 4413. 1946. 1990. 2338.
8 08 1993. 2407. 2552. 2998. 1507. 1761. 1929.
9 09 3022. 2601. 1662. 3246. 1727. 1655. 1113.
10 10 2923. 3402. 2991. 2754. 1759. 1722. 1830.
# ... with 42 more rows, and 9 more variables: Store_2_2020 <dbl>, Store_3_2017 <dbl>,
# Store_3_2018 <dbl>, Store_3_2019 <dbl>, Store_3_2020 <dbl>, Store_4_2017 <dbl>,
# Store_4_2018 <dbl>, Store_4_2019 <dbl>, Store_4_2020 <dbl>
You can use tidyverse package to solve your problem.
This transforms your Store_x variables into "long" format, makes a new variable that combines year and store, and then reshapes the result back to "wide" format.
# install.packages("tidyverse")
library(tidyverse)
# Reshape so that every year/store pair becomes its own column, with one row
# per week.
FINAL_SALES %>%
  # "2017 01" -> year = "2017", week = "01"
  separate(col = year_week, into = c("year", "week"), sep = " ") %>%
  # Stack the Store_* columns into name/value pairs.
  pivot_longer(cols = starts_with("Store")) %>%
  # Prefix each store name with its year, e.g. "2017 Store_1".
  mutate(name = paste(year, name)) %>%
  # Drop both year columns so that only `week` identifies a row.
  select(-c(Year, year)) %>%
  pivot_wider(names_from = name, values_from = value)
# # A tibble: 52 x 17
# week `2017 Store_1` `2017 Store_2` `2017 Store_3` `2017 Store_4`
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 01 1372. 705. 118. 549.
# 2 02 2398. 1416. 192. 791.
# 3 03 2107. 1010. 116. 981.
# 4 04 1753. 1295. 147. 311.
# 5 05 2395. 1295. 181. 918.
# 6 06 2309. 1513. 179. 617.
# ....
One approach would be to pivot longer then back to wider.
library(tidyverse)
# One row per week, one column per Year/Store combination.
FINAL_SALES %>%
  # Take the trailing digits of "YYYY WW" as the week. Refer to the column
  # through the data mask rather than FINAL_SALES$year_week, so the step keeps
  # working if rows are filtered or reordered earlier in the pipe.
  mutate(Week = str_extract(year_week, "[0-9]+$")) %>%
  pivot_longer(
    cols = starts_with("Store"),
    names_to = "Store",
    values_to = "Value"
  ) %>%
  # year_week is no longer needed once Week exists; plain negative selection
  # replaces the superseded one_of() helper.
  select(-year_week) %>%
  pivot_wider(names_from = c(Year, Store), values_from = Value)
# A tibble: 52 x 17
Week `2017_Store_1` `2017_Store_2` `2017_Store_3` `2017_Store_4` `2018_Store_1` `2018_Store_2` `2018_Store_3` `2018_Store_4`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 01 1372. 705. 118. 549. 3380. 1558. 246. 1575.
2 02 2398. 1416. 192. 791. 2292. 1388. 177. 726.
3 03 2107. 1010. 116. 981. 2707. 1416. 174. 1118.
4 04 1753. 1295. 147. 311. 2323. 1652. 208. 464.
5 05 2395. 1295. 181. 918. 2895. 1635. 194. 1067.
6 06 2309. 1513. 179. 617. 3135. 1922. 196. 1016.
7 07 3267. 1946. 190. 1130. 3488. 1990. 219. 1279.
8 08 1993. 1507. 215. 271. 2407. 1761. 209. 436.
9 09 3022. 1727. 216. 1079. 2601. 1655. 214. 732.
10 10 2923. 1759. 212. 953. 3402. 1722. 209. 1472.
# … with 42 more rows, and 8 more variables: `2019_Store_1` <dbl>, `2019_Store_2` <dbl>, `2019_Store_3` <dbl>,
# `2019_Store_4` <dbl>, `2020_Store_1` <dbl>, `2020_Store_2` <dbl>, `2020_Store_3` <dbl>, `2020_Store_4` <dbl>
Base R solution:
# Replace the combined "YYYY WW" label with a plain week column.
clean_df <- within(FINAL_SALES, {
  # year_week is a factor: take its labels and drop everything up to and
  # including the space, leaving only the week number ("2017 01" -> "01").
  week <- gsub(".* ", "", as.character(year_week))
  rm(year_week)
})

# Every column other than the two identifiers holds one store's sales.
store_cols <- setdiff(names(clean_df), c("Year", "week"))

# Melt the data frame: one row per Year/week/store with the value in `sales`.
melted_df <- reshape(
  clean_df,
  direction = "long",
  varying = store_cols,
  v.names = "sales",
  idvar = c("Year", "week"),
  timevar = "store",
  times = store_cols,
  new.row.names = seq_len(length(store_cols) * nrow(clean_df))
)

# Pivot (spread) the data frame into a week x store x Year table of sales.
cross_tab <- xtabs(sales ~ week + store + Year, data = melted_df)
You could also use the data.table package function dcast as follows:
library(data.table)
# Turn FINAL_SALES into a data.table by reference, then add the week part of
# the "YYYY WW" label as its own column.
setDT(FINAL_SALES)
FINAL_SALES[, Week := sub(".+ ", "", year_week)]

# Cast every Store_* column wide: one row per week, one column per store/year.
store_cols <- grep("Store", names(FINAL_SALES), value = TRUE)
dcast(FINAL_SALES, Week ~ Year, value.var = store_cols)
# Week Store_1_2017 Store_1_2018 Store_1_2019 Store_1_2020 ...
# 1: 01 1371.61 3379.72 2931.13 1597.71
# 2: 02 2398.37 2291.97 1909.50 1307.39
# 3: 03 2107.29 2707.31 3503.03 3863.65
# 4: 04 1752.56 2322.75 2006.97 3120.75
# 5: 05 2394.57 2895.41 2485.01 2696.29
# 6: 06 2309.07 3134.62 2910.42 3094.53
# 7: 07 3266.67 3487.83 4531.88 4412.56
# 8: 08 1993.10 2406.54 2551.89 2998.01
# 9: 09 3021.57 2601.10 1662.50 3245.77
# 10: 10 2922.96 3402.14 2990.98 2754.05 ...
# ...

How should I order a column with character values numerically [duplicate]

This question already has an answer here:
Order a "mixed" vector (numbers with letters)
(1 answer)
Closed 2 years ago.
If i have a matrix that looks like this:
Region Ålder Antal regpop Andel
[1,] "01 Stockholms län" "0 år" "28474" "2377081" "0.0119785568939384"
[2,] "01 Stockholms län" "1 år" "29033" "2377081" "0.0122137192632477"
[3,] "01 Stockholms län" "10 år" "29678" "2377081" "0.0124850604586045"
[4,] "01 Stockholms län" "100+ år" "524" "2377081" "0.000220438428475933"
[5,] "01 Stockholms län" "11 år" "29679" "2377081" "0.0124854811426283"
[6,] "01 Stockholms län" "12 år" "28956" "2377081" "0.0121813265934144"
[7,] "01 Stockholms län" "13 år" "28592" "2377081" "0.0120281976087479"
[8,] "01 Stockholms län" "14 år" "27572" "2377081" "0.0115990999044627"
[9,] "01 Stockholms län" "15 år" "27466" "2377081" "0.0115545073979389"
[10,] "01 Stockholms län" "16 år" "26691" "2377081" "0.0112284772794869"
[11,] "01 Stockholms län" "17 år" "26004" "2377081" "0.0109394673551301"
[12,] "01 Stockholms län" "18 år" "24996" "2377081" "0.0105154178591306"
[13,] "01 Stockholms län" "19 år" "24971" "2377081" "0.0105049007585354"
[14,] "01 Stockholms län" "2 år" "29268" "2377081" "0.0123125800088428"
[15,] "01 Stockholms län" "20 år" "24777" "2377081" "0.0104232880579164"
What should I do to order the rows by age, i.e. "0 år", "1 år", "2 år", ..., "100+ år"?
The gtools::mixedsort function can help here
x <- c("0 år", "1 år", "10 år", "100+ år", "11 år", "12 år", "13 år",
"14 år", "15 år", "16 år", "17 år", "18 år", "19 år", "2 år",
"20 år")
gtools::mixedsort(x)
# [1] "0 år" "1 år" "2 år" "10 år" "11 år" "12 år" "13 år" "14 år"
# [9] "15 år" "16 år" "17 år" "18 år" "19 år" "20 år" "100+ år"
If the object you shared is a matrix named data, then you could do
# `data` is a matrix, so the column has to be selected with [, "name"];
# data[["Ålder"]] only works on lists/data frames and fails on a matrix with
# "subscript out of bounds".
data[gtools::mixedorder(data[, "Ålder"]), ]
Here is a base R option. Extract the digits from the string using gsub and then convert to a numeric allowing us to reorder numerically. I have created a mock matrix just as an example. Perhaps if you could provide the actual matrix by using dput i.e. dput(matrix) then paste the output into your question, that would help people give more specific answers
# Mock data: age labels in plain alphabetical order plus a running index.
x <- c("0 år", "1 år", "10 år", "100+ år", "11 år", "12 år", "13 år",
       "14 år", "15 år", "16 år", "17 år", "18 år", "19 år", "2 år",
       "20 år")
# seq_along() is safe even for an empty vector, unlike 1:length(x).
y <- seq_along(x)
mat <- matrix(c(x, y), ncol = 2)

# Keep only digits (and dots) from the age label, convert to numeric and use
# that as the sort key, so that "2 år" sorts before "10 år".
age_num <- as.numeric(gsub("[^0-9.]", "", mat[, 1]))
mat[order(age_num), ]

Renaming labels of a factor in R

I have census data of male and female populations organized by age group:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"
csv_path <- "./datafiles/cc-est2018-alldata-54.csv"

# Download the file only once and reuse the local copy afterwards.
if (!file.exists(csv_path)) {
  # download.file() does not create missing directories, so make sure the
  # target folder exists first.
  dir.create(dirname(csv_path), recursive = TRUE, showWarnings = FALSE)
  download.file(url, destfile = csv_path, mode = "wb")
}

# Keep the rows with a non-zero age group for YEAR == 1 and only the columns
# used in the plot.
# NOTE(review): presumably AGEGRP 0 is the all-ages total and YEAR 1 the first
# estimate in the file - confirm against the census file layout.
popSample <- read.csv(csv_path) %>%
  filter(AGEGRP != 0 & YEAR == 1) %>%
  select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")

# Treat the age-group code as a categorical variable for faceting.
popSample$AGEGRP <- as.factor(popSample$AGEGRP)
I then plot the relationship between the male and female populations, faceted by age group (1-18, which is currently treated as an int):
g <- ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups", x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
g
Which results in this plot: https://share.getcloudapp.com/v1ur6O4e
The problem: I am trying to convert the column AGEGRP from ‘int’ to ‘factor’, and change the factors labels from “1”, “2”, “3”, … “18” to "AgeGroup1", "AgeGroup2", "AgeGroup3", … "AgeGroup18"
When I try this code, my AGEGRP column's observation values are all replaced with NAs:popSample$AGEGRP <- factor(popSample$AGEGRP, levels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"))
https://share.getcloudapp.com/qGuo1O4y
Thank you for your help,
# AGEGRP holds the codes 1-18, so those are the levels; the age ranges are the
# labels attached to them. Passing the ranges as `levels` matches none of the
# existing values and turns every observation into NA.
popSample$AGEGRP <- factor(
  popSample$AGEGRP,
  levels = 1:18,
  labels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24",
             "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49",
             "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74",
             "75 to 79", "80 to 84", "85+")
)
Need to add all levels though.
Alternatively
levels(popSample$AGEGRP) <- c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+")
should work as well.
Read in the csv again:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"
popSample <- read.csv(url) %>%
filter(AGEGRP != 0 & YEAR == 1) %>%
select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")
If you just want to add a prefix "AgeGroup" to your facet labels, you do:
ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP,labeller=labeller(AGEGRP = function(i)paste0("AgeGroup",i))) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups",
x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
If you need new factor labels, then you need to re-create the factor (like @Annet's answer below):
# Age-range labels in code order: position i is the label for AGEGRP code i.
lvls <- c(
  "0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29",
  "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59",
  "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"
)
# AGEGRP was already turned into a factor above, so it is used here as an
# index into the labels. If you can read the csv again, skip that earlier
# as.factor() step and index with the plain integer column instead.
popSample$AGEGRP <- factor(lvls[popSample$AGEGRP], levels = lvls)
Then plot:
ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups",
x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
To change all the factor labels with one function, you can use forcats::fct_relabel (forcats ships as part of the tidyverse, which you've already got loaded). The changed factor labels will carry over to the plot facets and the order stays the same.
First few entries:
# before relabelling
popSample$AGEGRP[1:4]
#> [1] 1 2 3 4
#> Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# after relabelling
forcats::fct_relabel(popSample$AGEGRP, ~paste0("AgeGroup", .))[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18
Or with base R, reassign the levels:
levels(popSample$AGEGRP) <- paste0("AgeGroup", levels(popSample$AGEGRP))
popSample$AGEGRP[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18

Combine two data frames by one variable and combining columns under one main header

I want to combine two dataframes T2 and T4 by variable "Industry" and the columns of each data set with one main heading. So in the final output table I want columns Industry, three columns of T2 under one column heading "Executive" and three other columns of T4 as sub-columns of one heading "management".
T2
Industry percentage_Yes percentage_No Total_responses
1 ALL 94 % 6 % 117
2 Banking/Financial Services 83 % 17 % 6
3 Chemicals 100 % 0 % 5
4 Consumer Goods 75 % 25 % 8
5 Energy 89 % 11 % 9
6 High Tech 100 % 0 % 8
7 Insurance/Reinsurance 100 % 0 % 14
8 Life Sciences 100 % 0 % 11
9 Logistics -- -- 3
10 Mining & Metals -- -- 1
11 Other Manufacturing 100 % 0 % 11
12 Other Non-Manufacturing -- -- 3
13 Retail & Wholesale 100 % 0 % 12
14 Services (Non-Financial) 88 % 12 % 24
15 Transportation Equipment -- -- 2
16 <NA> -- -- 0
T4
Industry percentage_Yes percentage_No Total_responses
1 ALL 96 % 4 % 121
2 Banking/Financial Services 86 % 14 % 7
3 Chemicals 100 % 0 % 5
4 Consumer Goods 100 % 0 % 8
5 Energy 100 % 0 % 9
6 High Tech 100 % 0 % 9
7 Insurance/Reinsurance 93 % 7 % 15
8 Life Sciences 91 % 9 % 11
9 Logistics -- -- 3
10 Mining & Metals -- -- 1
11 Other Manufacturing 100 % 0 % 12
12 Other Non-Manufacturing -- -- 3
13 Retail & Wholesale 100 % 0 % 12
14 Services (Non-Financial) 92 % 8 % 24
15 Transportation Equipment -- -- 2
16 <NA> -- -- 0
> dput(T2)
structure(list(Industry = c("ALL", "Banking/Financial Services",
"Chemicals", "Consumer Goods", "Energy", "High Tech", "Insurance/Reinsurance",
"Life Sciences", "Logistics", "Mining & Metals", "Other Manufacturing",
"Other Non-Manufacturing", "Retail & Wholesale", "Services (Non-Financial)",
"Transportation Equipment", NA), percentage_Yes = c("94 %", "83 %",
"100 %", "75 %", "89 %", "100 %", "100 %", "100 %", "--", "--",
"100 %", "--", "100 %", "88 %", "--", "--"), percentage_No = c("6 %",
"17 %", "0 %", "25 %", "11 %", "0 %", "0 %", "0 %", "--", "--",
"0 %", "--", "0 %", "12 %", "--", "--"), Total_responses = c(117,
6, 5, 8, 9, 8, 14, 11, 3, 1, 11, 3, 12, 24, 2, 0)), class = "data.frame", row.names = c(NA,
-16L), .Names = c("Industry", "percentage_Yes", "percentage_No",
"Total_responses"))
> dput(T4)
structure(list(Industry = c("ALL", "Banking/Financial Services",
"Chemicals", "Consumer Goods", "Energy", "High Tech", "Insurance/Reinsurance",
"Life Sciences", "Logistics", "Mining & Metals", "Other Manufacturing",
"Other Non-Manufacturing", "Retail & Wholesale", "Services (Non-Financial)",
"Transportation Equipment", NA), percentage_Yes = c("96 %", "86 %",
"100 %", "100 %", "100 %", "100 %", "93 %", "91 %", "--", "--",
"100 %", "--", "100 %", "92 %", "--", "--"), percentage_No = c("4 %",
"14 %", "0 %", "0 %", "0 %", "0 %", "7 %", "9 %", "--", "--",
"0 %", "--", "0 %", "8 %", "--", "--"), Total_responses = c(121,
7, 5, 8, 9, 9, 15, 11, 3, 1, 12, 3, 12, 24, 2, 0)), class = "data.frame", row.names = c(NA,
-16L), .Names = c("Industry", "percentage_Yes", "percentage_No",
"Total_responses"))
I have tried tabular, but then I'm getting the Industry column twice:
library("tables")
st<-rbind(data.frame(T2, Employee_Level = 'Exe', what = factor(rownames(T2), levels = rownames(T2)),
row.names= NULL, check.names = FALSE),
data.frame(T4,Employee_Level = 'Mgmt',what = factor(rownames(T4), levels = rownames(T4)),
row.names = NULL,check.names = FALSE))
mytable <- tabular(Heading()*what ~ Employee_Level*(`Industry`+`percentage_Yes`+`percentage_No`+`Total_responses`)*Heading()*(identity),data=st)
latex(mytable)
Here's one way using (my) huxtable package:
library(huxtable)
my_data <- cbind(T2, T4)[, c(1:4, 6:8)]
my_hux <- as_hux(my_data, add_colnames = TRUE)
my_hux <- insert_row(my_hux, rep("", 7))
my_hux[1, 2] <- "Executive"
my_hux[1, 5] <- "Management"
colspan(my_hux)[1, 2] <- 3
colspan(my_hux)[1, 5] <- 3
my_hux[2, 2:7] <- rep(c("% yes", "% no", "Total responses"), 2)
number_format(my_hux) <- 0
# This should look like what you want:
my_hux

Convert a date range to Date type in R

This vector of date ranges is included in a dataframe of mine with class 'character'. The formats vary depending on whether the date range crosses into a different month:
dput(pollingdata$dates)
c("Nov. 1-7", "Nov. 1-7", "Oct. 24-Nov. 6", "Oct. 4-Nov. 6",
"Oct. 30-Nov. 6", "Oct. 25-31", "Oct. 7-27", "Oct. 21-Nov. 3",
"Oct. 20-24", "Jul. 19", "Oct. 29-Nov. 4", "Oct. 28-Nov. 3",
"Oct. 27-Nov. 2", "Oct. 20-28", "Sep. 30-Oct. 20", "Oct. 15-19",
"Oct. 26-Nov. 1", "Oct. 25-31", "Oct. 24-30", "Oct. 18-26",
"Oct. 10-14", "Oct. 4-9", "Sep. 23-Oct. 6", "Sep. 16-29", "Sep. 2-22",
"Oct. 21-Nov. 2", "Oct. 17-25", "Sep. 30-Oct. 13", "Sep. 27-Oct. 3",
"Sep. 21-26", "Sep. 14-20", "Aug. 26-Sep. 15", "Sep. 7-13",
"Aug. 19-Sep. 8", "Aug. 31-Sep. 6", "Aug. 12-Sep. 1", "Aug. 9-Sep. 1",
"Aug. 24-30", "Aug. 5-25", "Aug. 17-23", "Jul. 29-Aug. 18",
"Aug. 10-16", "Jan. 12")
I would like to convert this vector into two separate columns in my dataframe, 1. startdate and 2. enddate, for the beginning and end of the range. Both columns should be saved as class 'Date', this will make it easier for me to use the data in my project. Does anyone know an easy way to do this manipulation? I have been struggling with it.
Thanks in advance,
We can split the vector (here `v1`, i.e. pollingdata$dates) on "-" into a list, prepend the month substring to trailing elements that contain only a day number, pad the elements that have fewer than 2 pieces with NA using (`length<-`), and convert the result to a data.frame (with do.call(rbind.data.frame)).
# Split every range on "-" and make sure the last piece carries a month.
lst <- lapply(strsplit(v1, "-"), function(parts) {
  n <- length(parts)
  # A last piece that starts with a digit ("7" in "Nov. 1-7") has no month of
  # its own, so borrow the first four characters ("Nov.") of the first piece.
  if (grepl("^[0-9]+", parts[n])) {
    parts[n] <- paste(substr(parts[1], 1, 4), parts[n])
  }
  parts
})

# Pad single-date entries with NA so that every element has the same length,
# then bind the pieces row-wise into a data frame.
n_parts <- max(lengths(lst))
padded <- lapply(lst, `length<-`, n_parts)
d1 <- do.call(rbind.data.frame, padded)
colnames(d1) <- c("Start_Date", "End_Date")
As per the OP's post, we need to convert to Date class, but Date class follows the format of %Y-%m-%d. In the vector, there is no year, not sure we can paste the current year and convert to Date class. If that is permissible, then
d1[] <- lapply(d1, function(x) as.Date(paste(x, 2017), "%b. %d %Y"))
head(d1)
# Start_Date End_Date
#1 2017-11-01 2017-11-07
#2 2017-11-01 2017-11-07
#3 2017-10-24 2017-11-06
#4 2017-10-04 2017-11-06
#5 2017-10-30 2017-11-06
#6 2017-10-25 2017-10-31
You may use the stringr function "str_split_fixed" to split the fields and then process the data. Load the stringr library and proceed as below:
library(stringr)
dat <- data.frame(date=c("Nov. 1-7", "Nov. 1-7", "Oct. 24-Nov. 6", "Oct. 4-Nov. 6",
"Oct. 30-Nov. 6", "Oct. 25-31", "Oct. 7-27", "Oct. 21-Nov. 3",
"Oct. 20-24", "Jul. 19", "Oct. 29-Nov. 4", "Oct. 28-Nov. 3",
"Oct. 27-Nov. 2", "Oct. 20-28", "Sep. 30-Oct. 20", "Oct. 15-19",
"Oct. 26-Nov. 1", "Oct. 25-31", "Oct. 24-30", "Oct. 18-26",
"Oct. 10-14", "Oct. 4-9", "Sep. 23-Oct. 6", "Sep. 16-29", "Sep. 2-22",
"Oct. 21-Nov. 2", "Oct. 17-25", "Sep. 30-Oct. 13", "Sep. 27-Oct. 3",
"Sep. 21-26", "Sep. 14-20", "Aug. 26-Sep. 15", "Sep. 7-13",
"Aug. 19-Sep. 8", "Aug. 31-Sep. 6", "Aug. 12-Sep. 1", "Aug. 9-Sep. 1",
"Aug. 24-30", "Aug. 5-25", "Aug. 17-23", "Jul. 29-Aug. 18",
"Aug. 10-16", "Jan. 12"))
Output processing:
# Split each entry on a dash or a whitespace character into exactly 4 pieces
# (missing pieces become ""), e.g. "Oct. 24-Nov. 6" -> "Oct.", "24", "Nov.", "6"
# and "Nov. 1-7" -> "Nov.", "1", "7", "".
dt <- data.frame(str_split_fixed(dat$date, "[-]|\\s",4))
names(dt) <- c("stdt1","stdt2","endt1","endt2")
# Remove the dot after the month abbreviation ("Nov." -> "Nov") in every column.
dt1 <- data.frame(sapply(dt,function(x)gsub("[.]","",x)))
# Start date: day + month + assumed year 2016, parsed as e.g. "1Nov2016".
dt1$stdt <- as.Date(paste0(dt1$stdt2,dt1$stdt1,"2016"),format="%d%b%Y")
# End date string: when endt2 is empty the range stays within one month, so
# endt1 holds the end day and the start month is reused; otherwise endt1 is the
# end month and endt2 the end day.
dt1$endt <- ifelse(dt1$endt2=="",paste0(dt1$endt1,dt1$stdt1,"2016"),
paste0(dt1$endt2,dt1$endt1,"2016"))
# A 7-character string such as "Jul2016" has no day at all (single-date entry),
# so the start day is put in front of it before parsing.
dt1$endt <-as.Date(ifelse(nchar(dt1$endt)==7,paste0(dt1$stdt2,dt1$endt),dt1$endt),"%d%b%Y")
Assumptions:
1) No year provided , hence I have taken year as 2016.
2) On 10th row and 43rd row, there is no info on end date "day",hence I have assumed the same day as start date.
Answer:
> dt1
stdt1 stdt2 endt1 endt2 stdt endt
1 Nov 1 7 2016-11-01 2016-11-07
2 Nov 1 7 2016-11-01 2016-11-07
3 Oct 24 Nov 6 2016-10-24 2016-11-06
4 Oct 4 Nov 6 2016-10-04 2016-11-06
5 Oct 30 Nov 6 2016-10-30 2016-11-06
6 Oct 25 31 2016-10-25 2016-10-31
7 Oct 7 27 2016-10-07 2016-10-27
8 Oct 21 Nov 3 2016-10-21 2016-11-03
9 Oct 20 24 2016-10-20 2016-10-24
10 Jul 19 2016-07-19 2016-07-19

Resources