Sum specific row in R - without character & boolean columns - r

I have a data frame loaded in R and I need to sum one row. The problem is that I've tried to use rowSums() function, but 2 columns are not numeric ones (one is character "Nazwa" and one is boolean "X" at the end of data frame). Is there any option to sum this row without those two columns? So I'd like to start from row 1, column 3 and don't include last column.
My data:
structure(list(Kod = c(0L, 200000L, 400000L, 600000L, 800000L,
1000000L), Nazwa = c("POLSKA", "DOLNOŚLĄSKIE", "KUJAWSKO-POMORSKIE",
"LUBELSKIE", "LUBUSKIE", "ŁÓDZKIE"), gospodarstwa.ogółem.gospodarstwa.2006.... = c(9187L,
481L, 173L, 1072L, 256L, 218L), gospodarstwa.ogółem.gospodarstwa.2007.... = c(11870L,
652L, 217L, 1402L, 361L, 261L), gospodarstwa.ogółem.gospodarstwa.2008.... = c(14896L,
879L, 258L, 1566L, 480L, 314L), gospodarstwa.ogółem.gospodarstwa.2009.... = c(17091L,
1021L, 279L, 1710L, 579L, 366L), gospodarstwa.ogółem.gospodarstwa.2010.... = c(20582L,
1227L, 327L, 1962L, 833L, 420L), gospodarstwa.ogółem.gospodarstwa.2011.... = c(23449L,
1322L, 371L, 2065L, 1081L, 478L), gospodarstwa.ogółem.gospodarstwa.2012.... = c(25944L,
1312L, 390L, 2174L, 1356L, 518L), gospodarstwa.ogółem.gospodarstwa.2013.... = c(26598L,
1189L, 415L, 2129L, 1422L, 528L), gospodarstwa.ogółem.gospodarstwa.2014.... = c(24829L,
1046L, 401L, 1975L, 1370L, 508L), gospodarstwa.ogółem.gospodarstwa.2015.... = c(22277L,
849L, 363L, 1825L, 1202L, 478L), gospodarstwa.ogółem.gospodarstwa.2016.... = c(22435L,
813L, 470L, 1980L, 1148L, 497L), gospodarstwa.ogółem.gospodarstwa.2017.... = c(20257L,
741L, 419L, 1904L, 948L, 477L), gospodarstwa.ogółem.gospodarstwa.2018.... = c(19207L,
713L, 395L, 1948L, 877L, 491L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2006..ha. = c(228038L,
19332L, 4846L, 19957L, 12094L, 3378L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2007..ha. = c(287529L,
21988L, 5884L, 23934L, 18201L, 3561L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2008..ha. = c(314848L,
28467L, 5943L, 26892L, 18207L, 4829L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2009..ha. = c(367062L,
26427L, 6826L, 30113L, 22929L, 5270L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2010..ha. = c(519069L,
39703L, 7688L, 34855L, 35797L, 7671L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2011..ha. = c(605520L,
45547L, 8376L, 34837L, 44259L, 8746L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2012..ha. = c(661688L,
44304L, 8813L, 37466L, 52581L, 9908L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2013..ha. = c(669970L,
37455L, 11152L, 40819L, 54692L, 10342L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2014..ha. = c(657902L,
37005L, 11573L, 38467L, 53300L, 11229L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2015..ha. = c(580730L,
31261L, 10645L, 34052L, 46343L, 10158L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2016..ha. = c(536579L,
29200L, 9263L, 31343L, 43235L, 9986L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2017..ha. = c(494978L,
27542L, 8331L, 29001L, 37923L, 9260L), gospodarstwa.ogółem.powierzchnia.użytków.rolnych.2018..ha. = c(484677L,
27357L, 7655L, 28428L, 37174L, 8905L), X = c(NA, NA, NA, NA,
NA, NA)), row.names = c(NA, 6L), class = "data.frame")
My attempt:
rowSums(dane_csv[, 3:length(dane_csv$Nazwa=='POLSKA')])

Using base R
# Sum only the numeric columns of each row. vapply() is preferred over
# sapply() here because it guarantees a logical vector (one TRUE/FALSE per
# column) regardless of input, whereas sapply()'s return type can vary.
rowSums(dane_csv[vapply(dane_csv, is.numeric, logical(1))])
-output
1 2 3 4 5 6
6667212 627833 511473 1033876 1288648 1108797
Or with dplyr
library(dplyr)
# transmute() keeps only the newly created column: a per-row sum taken
# across every numeric column (character "Nazwa" and logical "X" are skipped)
dane_csv %>%
transmute(out = rowSums(across(where(is.numeric))))

in base R use Filter function, to select the numeric columns then do a rowSums on them
# Flag the numeric columns, keep just those, and sum across each row
numeric_cols <- vapply(df, is.numeric, logical(1))
rowSums(df[numeric_cols])
1 2 3 4 5 6
6667212 627833 511473 1033876 1288648 1108797

You can select only the numeric columns:
library(dplyr)
# Drop the non-numeric columns, sum each row, then keep only the first
# row's total (the "POLSKA" row in the example data)
df %>%
select(where(is.numeric)) %>%
rowSums() %>%
first()
Result:
1
6667212

Related

Remove middle inconsistent characters from a column header column name with r

`
set.seed(500)
index <- sample(1:nrow(Bands_reflectance_2017),100, replace = FALSE )
Bands_reflectance_2017 <- dput(head(Bands_reflectance_2017[1:100]))
Bands_reflectance_2017 <-
structure(
list(
t2017.01.05T08.25.12.000000000_blue = c(5064L,
5096L, 5072L, 5048L, 5048L, 5064L),
t2017.01.15T08.26.22.000000000_blue = c(418L,
487L, 480L, 449L, 449L, 480L),
t2017.01.25T08.21.38.000000000_blue = c(312L,
414L, 385L, 385L, 385L, 403L),
t2017.02.04T08.27.09.000000000_blue = c(5156L,
5096L, 5204L, 5240L, 5240L, 5112L),
t2017.02.14T08.27.29.000000000_blue = c(2554L,
2896L, 2842L, 2776L, 2776L, 2934L),
t2017.02.24T08.23.38.000000000_blue = c(2662L,
2428L, 2630L, 2644L, 2644L, 2276L),
t2017.03.06T08.24.47.000000000_blue = c(340L,
403L, 409L, 407L, 407L, 391L),
t2017.03.16T08.16.07.000000000_blue = c(188L,
245L, 257L, 239L, 239L, 245L),
t2017.03.26T08.22.43.000000000_blue = c(379L,
397L, 381L, 345L, 345L, 387L),
t2017.04.05T08.23.06.000000000_blue = c(604L,
647L, 639L, 647L, 647L, 631L),
t2017.04.15T08.23.45.000000000_blue = c(311L,
382L, 376L, 379L, 379L, 425L),
t2017.04.25T08.23.17.000000000_blue = c(219L,
318L, 237L, 322L, 322L, 302L),
t2017.05.05T08.23.45.000000000_blue = c(979L,
1030L, 1021L, 1030L, 1030L, 985L),
t2017.05.15T08.28.11.000000000_blue = c(138L,
219L, 196L, 201L, 201L, 247L),
t2017.05.25T08.23.46.000000000_blue = c(655L,
779L, 736L, 752L, 752L, 777L),
t2017.06.04T08.25.50.000000000_blue = c(318L,
419L, 384L, 343L, 343L, 400L),
t2017.06.14T08.28.06.000000000_blue = c(397L,
387L, 407L, 432L, 432L, 347L),
t2017.06.24T08.26.00.000000000_blue = c(336L,
450L, 402L, 395L, 395L, 388L),
t2017.07.04T08.23.42.000000000_blue = c(502L,
538L, 512L, 495L, 495L, 505L),
t2017.07.09T08.23.09.000000000_blue = c(568L,
597L, 639L, 611L, 611L, 577L),
t2017.07.19T08.23.43.000000000_blue = c(479L,
517L, 536L, 529L, 529L, 528L),
t2017.07.24T08.23.44.000000000_blue = c(409L,
499L, 499L, 473L, 473L, 482L),
t2017.07.29T08.26.12.000000000_blue = c(781L,
801L, 810L, 823L, 823L, 735L),
t2017.08.03T08.26.43.000000000_blue = c(517L,
579L, 560L, 583L, 583L, 564L),
t2017.08.08T08.23.41.000000000_blue = c(575L,
654L, 650L, 650L, 650L, 602L),
t2017.08.13T08.23.44.000000000_blue = c(623L,
679L, 708L, 698L, 698L, 677L),
t2017.08.18T08.25.16.000000000_blue = c(614L,
651L, 648L, 597L, 597L, 651L),
t2017.08.23T08.22.22.000000000_blue = c(554L,
613L, 559L, 524L, 524L, 596L),
t2017.08.28T08.28.01.000000000_blue = c(769L,
814L, 772L, 744L, 744L, 828L),
t2017.09.02T08.23.42.000000000_blue = c(756L,
761L, 763L, 783L, 783L, 742L),
t2017.09.07T08.23.30.000000000_blue = c(807L,
865L, 826L, 838L, 838L, 837L),
t2017.09.12T08.23.35.000000000_blue = c(861L,
869L, 876L, 904L, 904L, 869L),
t2017.09.22T08.23.38.000000000_blue = c(4640L,
3780L, 4340L, 4728L, 4728L, 3060L),
t2017.09.27T08.16.41.000000000_blue = c(778L,
777L, 811L, 839L, 839L, 752L),
t2017.10.02T08.17.41.000000000_blue = c(766L,
868L, 851L, 857L, 857L, 799L),
t2017.10.07T08.24.51.000000000_blue = c(767L,
816L, 839L, 830L, 830L, 753L),
t2017.10.12T08.24.39.000000000_blue = c(678L,
688L, 706L, 750L, 750L, 627L),
t2017.10.17T08.15.32.000000000_blue = c(678L,
769L, 804L, 797L, 797L, 711L),
t2017.10.22T08.21.34.000000000_blue = c(3146L,
3134L, 3128L, 3160L, 3160L, 3118L),
t2017.10.27T08.23.27.000000000_blue = c(612L,
697L, 721L, 697L, 697L, 708L),
t2017.11.01T08.24.41.000000000_blue = c(941L,
982L, 1001L, 1010L, 1010L, 999L),
t2017.11.06T08.20.50.000000000_blue = c(670L,
824L, 836L, 824L, 824L, 785L),
t2017.11.11T08.27.40.000000000_blue = c(720L,
817L, 839L, 807L, 807L, 801L),
t2017.11.16T08.16.16.000000000_blue = c(9824L,
9744L, 9792L, 9744L, 9744L, 9536L),
t2017.11.21T08.17.00.000000000_blue = c(749L,
841L, 838L, 738L, 738L, 830L),
t2017.11.26T08.25.13.000000000_blue = c(735L,
863L, 832L, 713L, 713L, 899L),
t2017.12.01T08.20.22.000000000_blue = c(674L,
836L, 816L, 800L, 800L, 771L),
t2017.12.06T08.19.42.000000000_blue = c(2742L,
2770L, 2742L, 2762L, 2762L, 2798L),
t2017.12.11T08.19.00.000000000_blue = c(582L,
745L, 734L, 654L, 654L, 743L),
t2017.12.16T08.23.19.000000000_blue = c(926L,
1054L, 1001L, 946L, 946L, 1054L),
t2017.12.21T08.20.53.000000000_blue = c(7432L,
7484L, 7456L, 7404L, 7404L, 7484L),
t2017.12.26T08.20.39.000000000_blue = c(629L,
724L, 762L, 738L, 738L, 731L),
t2017.12.31T08.20.04.000000000_blue = c(667L,
765L, 762L, 718L, 718L, 765L),
t2017.01.05T08.25.12.000000000_green = c(5224L,
5196L, 5208L, 5152L, 5152L, 5172L),
t2017.01.15T08.26.22.000000000_green = c(837L,
938L, 907L, 858L, 858L, 927L),
t2017.01.25T08.21.38.000000000_green = c(735L,
808L, 770L, 770L, 770L, 836L),
t2017.02.04T08.27.09.000000000_green = c(5424L,
5492L, 5488L, 5536L, 5536L, 5832L),
t2017.02.14T08.27.29.000000000_green = c(3050L,
3094L, 3108L, 3228L, 3228L, 2900L),
t2017.02.24T08.23.38.000000000_green = c(2664L,
2450L, 2598L, 2646L, 2646L, 2340L),
t2017.03.06T08.24.47.000000000_green = c(702L,
735L, 749L, 727L, 727L, 729L),
t2017.03.16T08.16.07.000000000_green = c(632L,
685L, 708L, 685L, 685L, 703L),
t2017.03.26T08.22.43.000000000_green = c(744L,
841L, 806L, 809L, 809L, 818L),
t2017.04.05T08.23.06.000000000_green = c(1030L,
1036L, 1044L, 1050L, 1050L, 1040L),
t2017.04.15T08.23.45.000000000_green = c(634L,
720L, 708L, 699L, 699L, 751L),
t2017.04.25T08.23.17.000000000_green = c(619L,
698L, 716L, 723L, 723L, 687L),
t2017.05.05T08.23.45.000000000_green = c(1340L,
1368L, 1374L, 1404L, 1404L, 1354L),
t2017.05.15T08.28.11.000000000_green = c(525L,
633L, 619L, 612L, 612L, 626L),
t2017.05.25T08.23.46.000000000_green = c(1042L,
1118L, 1078L, 1028L, 1028L, 1148L),
t2017.06.04T08.25.50.000000000_green = c(655L,
778L, 783L, 769L, 769L, 813L),
t2017.06.14T08.28.06.000000000_green = c(772L,
829L, 838L, 810L, 810L, 822L),
t2017.06.24T08.26.00.000000000_green = c(741L,
888L, 848L, 798L, 798L, 865L),
t2017.07.04T08.23.42.000000000_green = c(867L,
918L, 912L, 846L, 846L, 946L),
t2017.07.09T08.23.09.000000000_green = c(936L,
1001L, 1012L, 972L, 972L, 985L),
t2017.07.19T08.23.43.000000000_green = c(848L,
911L, 925L, 915L, 915L, 903L),
t2017.07.24T08.23.44.000000000_green = c(855L,
907L, 947L, 913L, 913L, 937L),
t2017.07.29T08.26.12.000000000_green = c(1096L,
1106L, 1134L, 1150L, 1150L, 1116L),
t2017.08.03T08.26.43.000000000_green = c(987L,
1072L, 1040L, 1030L, 1030L, 1021L),
t2017.08.08T08.23.41.000000000_green = c(996L,
1011L, 1001L, 1011L, 1011L, 1032L),
t2017.08.13T08.23.44.000000000_green = c(1006L,
1100L, 1082L, 1078L, 1078L, 1092L),
t2017.08.18T08.25.16.000000000_green = c(977L,
1034L, 1032L, 976L, 976L, 1020L),
t2017.08.23T08.22.22.000000000_green = c(976L,
1054L, 1044L, 985L, 985L, 1072L),
t2017.08.28T08.28.01.000000000_green = c(1162L,
1176L, 1188L, 1150L, 1150L, 1200L),
t2017.09.02T08.23.42.000000000_green = c(1136L,
1152L, 1158L, 1176L, 1176L, 1130L),
t2017.09.07T08.23.30.000000000_green = c(1122L,
1166L, 1174L, 1194L, 1194L, 1162L),
t2017.09.12T08.23.35.000000000_green = c(1158L,
1170L, 1168L, 1180L, 1180L, 1146L),
t2017.09.22T08.23.38.000000000_green = c(3304L,
3218L, 3072L, 3580L, 3580L, 4148L),
t2017.09.27T08.16.41.000000000_green = c(1172L,
1228L, 1242L, 1224L, 1224L, 1172L),
t2017.10.02T08.17.41.000000000_green = c(1148L,
1224L, 1220L, 1200L, 1200L, 1164L),
t2017.10.07T08.24.51.000000000_green = c(1120L,
1164L, 1160L, 1148L, 1148L, 1114L),
t2017.10.12T08.24.39.000000000_green = c(1124L,
1158L, 1166L, 1144L, 1144L, 1090L),
t2017.10.17T08.15.32.000000000_green = c(1092L,
1190L, 1180L, 1154L, 1154L, 1146L),
t2017.10.22T08.21.34.000000000_green = c(3140L,
3124L, 3142L, 3134L, 3134L, 3096L),
t2017.10.27T08.23.27.000000000_green = c(1064L,
1104L, 1116L, 1078L, 1078L, 1098L),
t2017.11.01T08.24.41.000000000_green = c(1298L,
1310L, 1344L, 1344L, 1344L, 1318L),
t2017.11.06T08.20.50.000000000_green = c(1114L,
1240L, 1220L, 1164L, 1164L, 1212L),
t2017.11.11T08.27.40.000000000_green = c(1182L,1278L, 1278L, 1192L, 1192L, 1284L),
t2017.11.16T08.16.16.000000000_green = c(8872L, 8728L, 8816L, 8904L, 8904L, 8600L),
t2017.11.21T08.17.00.000000000_green = c(1166L, 1268L, 1250L, 1158L, 1158L, 1260L),
t2017.11.26T08.25.13.000000000_green = c(1138L, 1272L, 1288L, 1240L, 1240L, 1278L)), row.names = c(NA, 6L), class = "data.frame")
`
I have a dataframe of dates for per specific bands with 534 column headers as follow:
"t2017-12-31T08:20:04.000000000_red_edge_3"
"t2017-02-04T08:27:09.000000000_nir_1"
"t2017-12-31T08:20:04.000000000_swir_2"
Now, I want to remove everything and only remain with the date and the band name e.g in column header one and two, I want to only remain with
"2017-12-31_red_edge_3"
"2017-02-04_nir_1"
I have about 534 columns and most characters are not consistent because each date time is different and more band examples not similar to what is shown here for all the 534 records, so I was only able to remove repetitive characters such as "T08", ":","t" and "000000000" which are available in all the columns. How do I remove the values between the date and the band characters when they vary per each column and so I cannot use :
for ( col in 1:ncol(Bands_reflectance_2017[5:534])){
colnames(Bands_reflectance_2017)[5:534] <- sub(".000000000", "", colnames(Bands_reflectance_2017)[5:534]) #Remove .000000000
}
etc
Also at the end of the day, I want to replace each bandname with a band coding system such as assign "nir-1" as "B8" and "12" as the month of "December" so that for example my first and second column header reads:
B7_December31
B8_February02
Cell 1
Cell 2
Cell 3
Cell 4
"B7_December31", "B8_February02" which are better naming to run in a random forest. Because I am running into problems of
Error in eval(predvars, data, env) : object '"t2017-12-31T08:20:04.000000000_red_edge_3"' not found
if I keep the naming convention in the example
I have the following column header names in my dataframe (Bands_reflectance_2017) of 534 columns :
"t2017-01-25T08:21:38.000000000_blue"
"t2017-08-23T08:22:22.000000000_green"
Cell 1
Cell 2
Cell 3
Cell 4
I want to remove everything except the date and band name e.g "2017_01_25_blue"
I tried:
for ( col in 1:ncol(Bands_reflectance_2017[5:534])){
colnames(Bands_reflectance_2017)[5:534] <- sub("T08", "", colnames(Bands_reflectance_2017)[5:534]) #Remove T08
But as some of the characters I want to remove are unique per each 534 columns, I am not sure how to remove them
I expect this at the end of the day:
2017_01_25_blue
2017_08_23_green
Cell 1
Cell 2
Cell 3
Cell 4
The latter:
"B2_December31", B3_August23
Cell 1
Cell 3
I also tried this :
substr(colnames(Bands_Reflectance_2017[2:335]),2,11)
What is the best way to do it? I am fairly new to programming and to r.
Thanks for sharing your code and data. Most people won't download random files. In the future you can share data with dput(data) or a smaller version with dput(head(data)).
library(stringr)
library(lubridate)
# Using the data frame that you provided with dput, which I call "df1" here
# You'll probably have to adjust the numbers between the [] because your
# data frame is vastly different from what I have and I'm not sure I have
# the right number, but since you said 534 columns, I'm using that.
df1 <- names(df1)[1:534]
band_names <- rep(NA, length(df1))
# This is messy. I'm sure someone who knows stringr or
# regex better has a neater way to do this.
# str_locate will find positions in a string and return the numeric value of the position
# str_sub uses positions to pull substrings
# gsub replaces patterns
# What this does is find the positions of the dates or labels,
# pulls out the substring, replaces things not needed
# (like "-" I used to mark positions), changed the number for date
# to something numeric so that month() can be switched from number to text.
# Build a new label for each column name in df1 (assumes names shaped like
# "t2017.12.31T08.20.04.000000000_band" — TODO confirm against real data):
#  - month(): first ".NN" chunk after the year, turned into a full month name
#  - gsub("T", ...): the ".NNT" day chunk with the trailing "T" stripped
#  - the band label: first run of 3+ lowercase letters through end of string
# seq_along() is safe even when df1 is empty, unlike 1:length(df1).
for (i in seq_along(df1)) {
  band_names[i] <- paste0(
    as.character(month(
      as.numeric(gsub("\\.", "",
        str_sub(df1[i], str_locate(df1[i], "\\.[0-9]{2}")))),
      label = TRUE, abbr = FALSE
    )),
    gsub("T", "", str_sub(df1[i], str_locate(df1[i], "\\.[0-9]{2}T"))),
    "_",
    str_sub(df1[i], str_locate(df1[i], "[a-z]{3,}.+"))
  )
}
# You can look at the results
band_names
[1] "Dec-12_red_edge_3" "Feb-02_nir_1" "Dec-12_swir_2"
# Split up band_names to replace the band label with number
band_out <- str_sub(band_names, 7)
band_stay <- str_sub(band_names, 1, 6)
# Made data frame up for the few example lines. I'm not downloading the CSV and I'm not going to find out the actual band names, labels, and numbers.
fake_bands <- data.frame(label = c("red_edge_3", "nir_1", "swir_2"), number = c("b1","b3","b2"))
# Change out labels for the numbers
band_replace <- fake_bands[match(band_out, fake_bands$label), "number"]
new_names <- paste0(band_stay, band_replace)
new_names
[1] "Dec-12_b1" "Feb-02_b3" "Dec-12_b2"
# Again, you might have to adjust the numbers in []
names(df1)[1:534] <- new_names
You're going to have to expand/replace the fake_bands data frame I made here with a data frame that has two columns. One column should have the labels, like "red_edge_3", and the other should have the appropriate band number.

How to compare changes in data profiles over time?

Say , I have datasets
df1=
structure(list(date = c("17.02.2021", "04.11.2020", "14.11.2020",
"24.11.2020", "29.11.2020", "04.12.2020", "09.12.2020"), x1 = c(0L,
0L, 7L, 0L, 0L, 0L, 0L), x2 = c(674L, 632L, 1036L, 656L, 736L,
762L, 698L), x3 = c(698L, 712L, 1140L, 704L, 784L, 786L, 722L
), x4 = c(522L, 472L, 988L, 464L, 608L, 578L, 514L), x5 = c(2408L,
3256L, 2840L, 2840L, 2888L, 2632L, 2648L), x6 = c(1952L, 2336L,
2480L, 2208L, 2208L, 2144L, 2016L), x7 = c(1056L, 1120L, 1504L,
1056L, 1184L, 1184L, 1120L), x8 = c(1984L, 2464L, 2400L, 2144L,
2208L, 2144L, 2080L), x9 = c(2336L, 2976L, 2784L, 2464L, 2784L,
2528L, 2400L), x10 = c(2528L, 3232L, 3104L, 2848L, 2912L, 2592L,
2656L), x11 = c(1248L, 1312L, 1504L, 1312L, 1312L, 1312L, 1248L
)), class = "data.frame", row.names = c(NA, -7L))
Each row is a date. The data profile for the first day is shown here.
The second day has data profiles
and so on.
Here reference dataset
df2=structure(list(date = c("06.11.2019", "01.12.2019", "25.01.2020",
"04.02.2020", "09.02.2020", "14.02.2020"), x1 = c(12L, 0L, 1L,
6L, 23L, 1L), x2 = c(1272L, 1046L, 688L, 572L, 592L, 328L), x3 = c(1032L,
974L, 736L, 780L, 800L, 568L), x4 = c(792L, 862L, 496L, 476L,
592L, 296L), x5 = c(2232L, 1496L, 1784L, 2792L, 3064L, 3544L),
x6 = c(2976L, 1904L, 1632L, 1760L, 1376L, 1440L), x7 = c(1568L,
1248L, 1008L, 1120L, 992L, 800L), x8 = c(1888L, 1376L, 1632L,
2400L, 2464L, 2720L), x9 = c(2080L, 1504L, 1760L, 2848L,
2912L, 3296L), x10 = c(2400L, 1552L, 1824L, 2848L, 2928L,
3360L), x11 = c(2400L, 1504L, 1120L, 1040L, 784L, 736L)), class = "data.frame", row.names = c(NA,
-6L))
Is there a way or method that would compare the profile of each row of data in df1 with the reference dataset df2, if the profile is similar, then 1 otherwise 0
The date in both dataset can be different, the main problem is detect is profiles are similar or not.
My desired output: Peter's code is good, but is it possible to calculate the difference between profiles by variable, for example?
This code allows you to visually compare the reference and df1 profiles. As you can see that none of the profiles match exactly. Some profiles are similar, but without a definition of "similar" as pointed out by #user2974951 it's difficult to move this closer to an answer.
library(dplyr)
library(tidyr)
library(ggplot2)
# restructure the data to allow comparison between the datasets
df <-
expand.grid("date_ref" = df2$date, "date_df1" = df1$date) %>%
left_join(df2, by = c("date_ref" = "date")) %>%
left_join(df1, by = c("date_df1" = "date")) %>%
pivot_longer(starts_with("x"), names_to = c("var", "df"), names_sep = "\\.") %>%
mutate(df = if_else(df == "x", "ref", "df1"),
var = factor(var, paste0("x", 1:11)))
# now you can plot the data to compare profiles; had to add some formatting to make the graph readable.
ggplot(df, aes(var, value, group = df, colour = df))+
geom_line()+
facet_grid(date_ref~date_df1)+
labs(colour = "Dataset")+
theme_classic()+
theme(legend.position = "bottom",
axis.text.x = element_text(size = 6, angle = 90),
axis.text.y = element_text(size = 6),
strip.text = element_text(size = 6))
Created on 2021-04-07 by the reprex package (v1.0.0)
What you need to define first is what criteria of similarity you want to use and what your threshold level of similarity is (how similar the datasets need to be to be considered equivalent). Also the important factor is the nature of your data. For example whether you consider your x1..x11 to be independent or just different samples of the same set.
Depending on the answers it can be anything from comparing each df1[i,2:12] to df2[i,2:12] exactly (if they are just duplicates or not) to comparing both of them to NA and checking if they are both NA or both a known value. Something in between would be checking if the differences of each parameter for each line of the datasets are not greater than 0.05 of the minimal value for example and marking the line equivalent if all the parameters are OK, or using something like Pearson's correlation coefficient (the cor(x,y) function has it enabled by default) for each line and comparing its value to 0.5 for example (both 0.05 and 0.5 are just arbitrary numbers of course and they probably need to be adjusted somewhat). Or maybe the amount of matching points (compared exactly as integers or just similar to some degree) is a better indication for you. There are also known standard tests for sample group dissimilarities, time series dissimilarities, or other statistical hypotheses. Many of them are available in R from bundled packages, and if you fancy something else then it is most likely already available in one of the extra packages you can easily download and install.

Finding change over time from a table in R - fitting a model to the data

this is the table that has the information, it is much larger but I have just included a small part of it
Part 1 : (solved) This might seem quite easy, but I am very stuck on creating code to make a graph which shows the change over time. The table shows infant mortality and I just need a graph showing how the infant mortality of one country has changed over time.
Part 2 : how can I fit a line to the data - for example, for just one country? I need to model the data as a linear response. How can I do this?
my_data <- structure(list(`1800` = c("Afghanistan", "Albania", "Algeria",
"Andorra", "Angola", "Antigua and Barbuda", "Argentina", "Armenia",
"Australia", "Austria"), `1801` = c(469L, 375L, 460L, NA, 486L,
474L, 402L, 371L, 391L, 387L), `1802` = c(469L, 375L, 460L, NA,
486L, 470L, 402L, 371L, 391L, 373L), `1803` = c(469L, 375L, 460L,
NA, 486L, 466L, 402L, 370L, 391L, 359L), `1804` = c(469L, 375L,
460L, NA, 486L, 462L, 402L, 370L, 391L, 346L), `1805` = c(469L,
375L, 460L, NA, 486L, 458L, 402L, 369L, 391L, 333L), `1806` = c(469L,
375L, 460L, NA, 486L, 455L, 402L, 369L, 391L, 321L), `1807` = c(470L,
375L, 460L, NA, 486L, 451L, 402L, 368L, 391L, 309L), `1808` = c(470L,
375L, 460L, NA, 486L, 447L, 402L, 368L, 391L, 325L), `1809` = c(470L,
375L, 460L, NA, 486L, 444L, 402L, 367L, 391L, 316L)), row.names = c(NA,
10L), class = "data.frame")
You can plot what the question asks for with package ggplot2.
This type of problem generally has to do with reshaping the data. The format should be the long format, but the data is in wide format. See this post on how to reshape the data from wide to long format.
library(dplyr)
library(tidyr)
library(ggplot2)
# Reshape wide -> long (one row per country/year), then draw one line per country
my_data %>%
# the first column is labelled "1800" but actually holds the country names
rename(country = `1800`) %>%
pivot_longer(
cols = starts_with('18'),
names_to = 'time',
values_to = 'mortality'
) %>%
# year column names arrive as strings; make them numeric for a continuous x axis
mutate(time = as.numeric(time)) %>%
ggplot(aes(time, mortality, color = country)) +
geom_point() +
geom_line()
We can use matplot from base R
# Base-R alternative: transpose the numeric part so each country becomes a
# column, label the rows with the country names, then draw all series at once
matplot(t('row.names<-'(as.matrix(my_data[-1]), my_data[[1]])),
type = 'l', xaxt = 'n')
# one legend entry per country, colours matching matplot's default sequence
legend("top", my_data[[1]], col = seq_along(my_data[[1]]),
fill = seq_along(my_data[[1]]))

Heat map soccer game in r

I have a dataframe with the position of a player on a pitch.
The bounding box for the area is 0 - 1000 and 0 - 750.
The starting ball position is 375-500 and the starting player position is 637-692.
I was trying using geom_tile but I can't get a heat map. How can I link the variables to make a heatmap?
frames <- structure(list(half = c("1T", "1T", "1T", "1T", "1T", "1T", "1T",
"1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T", "1T",
"1T", "1T", "2T", "2T", "2T", "2T"), `ball-X` = c(375L, 375L,
375L, 375L, 375L, 372L, 365L, 358L, 351L, 344L, 338L, 332L, 326L,
320L, 315L, 310L, 305L, 301L, 300L, 309L, 631L, 631L, 631L, 631L
), `ball-Y` = c(500L, 500L, 500L, 500L, 500L, 490L, 470L, 450L,
432L, 414L, 397L, 381L, 365L, 350L, 336L, 322L, 309L, 297L, 302L,
304L, 577L, 582L, 589L, 596L), `L-2-X` = c(637L, 637L, 636L,
636L, 639L, 639L, 641L, 643L, 645L, 648L, 652L, 656L, 660L, 665L,
669L, 672L, 673L, 674L, 673L, 672L, 227L, 230L, 233L, 235L),
`L-2-Y` = c(692L, 692L, 691L, 688L, 685L, 684L, 681L, 678L,
674L, 669L, 663L, 657L, 649L, 641L, 633L, 624L, 615L, 606L,
596L, 587L, 438L, 445L, 452L, 460L)), class = "data.frame", row.names = c(NA,
-24L))
ggplot(frames, aes(x = `L-2-X`, y = `L-2-Y`)) +
scale_x_continuous(limits = c(0,750))+
scale_y_continuous(limits = c(0,1000))+
geom_tile(aes(fill = `L-2-X`)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_light()+
facet_grid(~ half)
Not sure about the final result you are trying to achieve. As far as I can tell your code works fine. However, your tiles are simply too small to be visible. Only when I zoomed into the plot did some tiny tiles appear.
Therefore I would recommend binning the data to get a nice heatmap. As an example, my code below bins the data into 25-by-25 squares (cm?). For the fill I simply count the number of observations per square. Another approach would be to use e.g. geom_hex, which uses hexagons for the binning.
library(ggplot2)
library(dplyr)
# Bin data
frames_bin <- frames %>%
# Bin data
mutate(l_2_x = cut(`L-2-X`, breaks = seq(0, 750, 25), labels = seq(0, 725, 25), include.lowest = TRUE),
l_2_y = cut(`L-2-Y`, breaks = seq(0, 1000, 25), labels = seq(0, 975, 25), include.lowest = TRUE)) %>%
# Count number of obs per bin
count(half, l_2_x, l_2_y) %>%
# Convert factors to numeric
mutate_at(vars(l_2_x, l_2_y), ~ as.numeric(as.character(.x)))
ggplot(frames_bin) +
scale_x_continuous(limits = c(0, 750)) +
scale_y_continuous(limits = c(0, 1000)) +
geom_tile(aes(x = l_2_x, y = l_2_y, fill = n)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_light()+
facet_grid(~ half)
# Out of the box: use geom_hex
ggplot(frames) +
scale_x_continuous(limits = c(0, 750)) +
scale_y_continuous(limits = c(0, 1000)) +
geom_hex(aes(x = `L-2-X`, y = `L-2-Y`, fill = ..ncount..)) +
scale_fill_viridis_c(option = "B", direction = -1) +
theme_light()+
facet_grid(~ half)

monthly average of working days data

I have daily time series (of working days) which I would like to transform in monthly average.
The date format is %d/%m/%Y, moreover there are some missing observations (NA).
How can I do this?
# my data
timeseries <- structure(c(309L, 319L, 329L, 339L, 348L, 374L, 384L, 394L, 404L, 413L,
2317L, 2327L, 2337L, 2347L, 2356L, 2382L, 2392L, 2402L, 2412L, 2421L, 2447L, 2457L,
422L, 432L, 441L, 467L, 477L, 487L, 497L, 506L, 2467L, 2477L, 2487L, 2497L, 2506L,
2532L, 2542L, 2552L, 2562L, 2571L, 2597L, 2607L, 2617L, 2627L, 2636L,
[...]), .Label = c("01/01/1992", "01/01/1993", "01/01/1996", "01/01/1997", "01/01/1998", "01/01/1999", "01/01/2001 [...] ), class = "factor")
You can do this many, many ways. Using base R packages:
# Example data: 60 consecutive daily values starting tomorrow
d <- data.frame(Date = Sys.Date() + 1:60, Data = 1:60)
# Group the days by year-month ("YYYYMM") and average each group.
# na.rm = TRUE (forwarded to mean via ...) skips the missing observations
# the question mentions; without it any month containing an NA returns NA.
tapply(d$Data, format(d$Date, "%Y%m"), mean, na.rm = TRUE)
aggregate(d$Data, by = list(Date = format(d$Date, "%Y%m")), mean, na.rm = TRUE)

Resources