subset a r dataframe based on conditions

subset a r dataframe based on conditions - r

I have a dataframe like this:
From this, I want to delete the rows which have same elements repeated in the multiple columns: assemblyName, qseqid, sseqid. At the same time, I want to keep the rows even if they are repeated but if there is an other row/rows with same assemblyName but have different qseqid and sseqid.
For example, from the above image: batch2_21032019_ENT924_assembly.fasta in the assemblyName has 4 records (rows: 747,748,771,785). If row 771,785 were not there, I would remove rows 747,748 which contains same assemblyName, qseqid and sseqid. But since here, there are row 771 and 785 which have different qseqid and sseqid, I wish to retain all the 4 rows.
But the last few rows from 1422 to 1503, I do not want to keep them because they are repeated in the columns assemblyName, qseqid, sseqid
Basically, what I want is a dataframe with the following output:
How do I achieve this in R? Here is my dput snippet:
structure(list(assemblyName = structure(c(5L, 12L, 12L, 24L,
24L, 42L, 48L, 48L, 48L, 48L, 76L, 76L, 76L, 79L, 79L, 79L), .Label = c("batch1_08032019_ENT1252_assembly.fasta",
"batch1_08032019_ENT1350_assembly.fasta", "batch1_08032019_ENT1368_assembly.fasta",
"batch1_08032019_ENT1382_assembly.fasta", "batch1_08032019_ENT1420_assembly.fasta",
"batch1_08032019_ENT1458_assembly.fasta", "batch1_08032019_ENT1489_assembly.fasta",
"batch14_02082019_ENT1646_assembly.fasta", "batch2_21032019_ENT1079_assembly.fasta",
"batch2_21032019_ENT1192_assembly.fasta", "batch2_21032019_ENT1219_assembly.fasta",
"batch2_21032019_ENT1250_assembly.fasta", "batch2_21032019_ENT1357_assembly.fasta",
"batch2_21032019_ENT1440_assembly.fasta", "batch2_21032019_ENT1669_assembly.fasta",
"batch2_21032019_ENT1758_assembly.fasta", "batch2_21032019_ENT1916_assembly.fasta",
"batch2_21032019_ENT1940_assembly.fasta", "batch2_21032019_ENT1968_assembly.fasta",
"batch2_21032019_ENT256_assembly.fasta", "batch2_21032019_ENT264_assembly.fasta",
"batch2_21032019_ENT267_assembly.fasta", "batch2_21032019_ENT268_assembly.fasta",
"batch2_21032019_ENT285_assembly.fasta", "batch2_21032019_ENT3_assembly.fasta",
"batch2_21032019_ENT310_assembly.fasta", "batch2_21032019_ENT312_assembly.fasta",
"batch2_21032019_ENT337_assembly.fasta", "batch2_21032019_ENT341_assembly.fasta",
"batch2_21032019_ENT358_assembly.fasta", "batch2_21032019_ENT360_assembly.fasta",
"batch2_21032019_ENT378_assembly.fasta", "batch2_21032019_ENT385_assembly.fasta",
"batch2_21032019_ENT421_assembly.fasta", "batch2_21032019_ENT422_assembly.fasta",
"batch2_21032019_ENT423_assembly.fasta", "batch2_21032019_ENT454_assembly.fasta",
"batch2_21032019_ENT465_assembly.fasta", "batch2_21032019_ENT466_assembly.fasta",
"batch2_21032019_ENT473_assembly.fasta", "batch2_21032019_ENT497_assembly.fasta",
"batch2_21032019_ENT5_assembly.fasta", "batch2_21032019_ENT50_assembly.fasta",
"batch2_21032019_ENT595_assembly.fasta", "batch2_21032019_ENT607_assembly.fasta",
"batch2_21032019_ENT708_assembly.fasta", "batch2_21032019_ENT807_assembly.fasta",
"batch2_21032019_ENT924_assembly.fasta", "batch20_11102019_ENT1249_assembly.fasta",
"batch20_11102019_ENT783_assembly.fasta", "batch20_11102019_ENT784_assembly.fasta",
"batch20_11102019_ENT785_assembly.fasta", "batch20_11102019_ENT835_assembly.fasta",
"batch20_11102019_ENT849_assembly.fasta", "batch20_11102019_ENT897_assembly.fasta",
"batch20_11102019_ENT901_assembly.fasta", "batch20_11102019_ENT903_assembly.fasta",
"batch20_11102019_ENT912_assembly.fasta", "batch20_11102019_ENT916_assembly.fasta",
"batch20_11102019_ENT938_assembly.fasta", "batch20_11102019_ENT965_assembly.fasta",
"batch20_11102019_ENT981_assembly.fasta", "batch20_11102019_ENT983_assembly.fasta",
"batch20_11102019_ENT990_assembly.fasta", "batch21x_16102019_ENT1251_assembly.fasta",
"batch21x_16102019_ENT1262_assembly.fasta", "batch21x_16102019_ENT1263_assembly.fasta",
"batch21x_16102019_ENT1266_assembly.fasta", "batch21x_16102019_ENT1267_assembly.fasta",
"batch21x_16102019_ENT1271_assembly.fasta", "batch21x_16102019_ENT1274_assembly.fasta",
"batch21x_16102019_ENT1276_assembly.fasta", "batch21x_16102019_ENT1278_assembly.fasta",
"batch21x_16102019_ENT1279_assembly.fasta", "batch21x_16102019_ENT1280_assembly.fasta",
"batch21x_16102019_ENT1288_assembly.fasta", "batch21x_16102019_ENT1296_assembly.fasta",
"batch21x_16102019_ENT1300_assembly.fasta", "batch21x_16102019_ENT1321_assembly.fasta",
"batch21x_16102019_ENT1322_assembly.fasta", "batch21x_16102019_ENT1325_assembly.fasta",
"batch21x_16102019_ENT1330_assembly.fasta", "batch21x_16102019_ENT1384_assembly.fasta",
"batch21x_16102019_ENT1393_assembly.fasta", "batch21x_16102019_ENT1394_assembly.fasta",
"batch21x_16102019_ENT1396_assembly.fasta", "batch21x_16102019_ENT1465_assembly.fasta",
"batch21x_16102019_ENT1502_assembly.fasta", "batch21x_16102019_ENT1570_assembly.fasta",
"batch21x_16102019_ENT1599_assembly.fasta", "batch21x_16102019_ENT1649_assembly.fasta",
"batch21x_16102019_ENT1676_assembly.fasta", "batch21x_16102019_ENT1681_assembly.fasta",
"batch21x_16102019_ENT1691_assembly.fasta", "batch21x_16102019_ENT1837_assembly.fasta",
"batch21x_16102019_ENT1895_assembly.fasta", "batch21x_16102019_ENT1896_assembly.fasta",
"batch21x_16102019_ENT1929_assembly.fasta", "batch21x_16102019_ENT1941_assembly.fasta",
"batch21x_16102019_ENT209_assembly.fasta", "batch21x_16102019_ENT689_assembly.fasta",
"batch21x_16102019_ENT732_assembly.fasta", "batch21x_16102019_ENT790_assembly.fasta",
"batch22_18102019_ENT1331_assembly.fasta", "batch22_18102019_ENT1336_assembly.fasta",
"batch22_18102019_ENT1337_assembly.fasta", "batch22_18102019_ENT1352_assembly.fasta",
"batch22_18102019_ENT1359_assembly.fasta", "batch22_18102019_ENT1413_assembly.fasta",
"batch22_18102019_ENT1475_assembly.fasta", "batch22_18102019_ENT1515_assembly.fasta",
"batch22_18102019_ENT1559_assembly.fasta", "batch22_18102019_ENT1580_assembly.fasta",
"batch22_18102019_ENT1595_assembly.fasta"), class = "factor"),
qseqid = structure(c(107L, 71L, 89L, 109L, 122L, 119L, 19L,
19L, 69L, 117L, 61L, 61L, 61L, 72L, 72L, 72L), .Label = c("",
"1_length=4775743_depth=1.00x_circular=true", "1_length=4782442_depth=1.00x_circular=true",
"1_length=4798941_depth=1.00x_circular=true", "1_length=4811272_depth=1.00x_circular=true",
"1_length=4854518_depth=1.00x_circular=true", "1_length=4870013_depth=1.00x",
"1_length=4877560_depth=1.00x_circular=true", "1_length=4879405_depth=1.00x_circular=true",
"1_length=4880726_depth=1.00x_circular=true", "1_length=4910657_depth=1.00x_circular=true",
"1_length=4945396_depth=1.00x_circular=true", "1_length=4980803_depth=1.00x_circular=true",
"1_length=4995045_depth=1.00x_circular=true", "1_length=4995093_depth=1.00x_circular=true",
"1_length=5004019_depth=1.00x_circular=true", "1_length=5024487_depth=1.00x_circular=true",
"1_length=5386431_depth=1.00x_circular=true", "1_length=5418220_depth=1.00x_circular=true",
"10_length=167596_depth=0.99x_circular=true", "10_length=41259_depth=2.09x_circular=true",
"19_length=13505_depth=0.90x", "2_length=123974_depth=3.35x_circular=true",
"2_length=174608_depth=2.06x_circular=true", "2_length=177751_depth=2.86x_circular=true",
"2_length=258181_depth=1.64x_circular=true", "2_length=278408_depth=1.57x_circular=true",
"2_length=41183_depth=3.34x_circular=true", "2_length=41190_depth=5.16x_circular=true",
"2_length=41215_depth=3.01x_circular=true", "2_length=41217_depth=2.25x_circular=true",
"2_length=71861_depth=0.77x_circular=true", "2_length=71861_depth=2.89x_circular=true",
"2_length=72968_depth=0.51x_circular=true", "2_length=91069_depth=1.21x_circular=true",
"2_length=91643_depth=2.11x_circular=true", "2_length=92072_depth=0.81x_circular=true",
"20_length=5469_depth=1.62x", "22_length=90789_depth=1.44x_circular=true",
"3_length=112875_depth=0.98x_circular=true", "3_length=118064_depth=3.79x_circular=true",
"3_length=127528_depth=1.73x_circular=true", "3_length=164596_depth=1.02x_circular=true",
"3_length=165091_depth=1.16x_circular=true", "3_length=165095_depth=2.12x_circular=true",
"3_length=165543_depth=0.59x_circular=true", "3_length=174323_depth=1.93x_circular=true",
"3_length=174796_depth=0.74x_circular=true", "3_length=180232_depth=1.88x_circular=true",
"3_length=180817_depth=1.81x_circular=true", "3_length=38610_depth=3.37x_circular=true",
"3_length=41182_depth=3.37x_circular=true", "3_length=41182_depth=4.04x_circular=true",
"3_length=41184_depth=4.98x_circular=true", "3_length=41185_depth=5.84x_circular=true",
"3_length=41186_depth=3.26x_circular=true", "3_length=41232_depth=2.49x_circular=true",
"3_length=50138_depth=1.79x_circular=true", "3_length=58175_depth=0.39x",
"3_length=62334_depth=2.76x_circular=true", "3_length=67915_depth=0.42x",
"3_length=71861_depth=2.39x_circular=true", "3_length=71861_depth=2.99x_circular=true",
"3_length=72145_depth=0.97x_circular=true", "3_length=72168_depth=0.80x_circular=true",
"3_length=731673_depth=1.22x", "3_length=74789_depth=2.02x_circular=true",
"3_length=74794_depth=2.26x_circular=true", "3_length=75214_depth=2.77x_circular=true",
"3_length=79594_depth=1.46x_circular=true", "3_length=88353_depth=2.00x_circular=true",
"3_length=89872_depth=0.49x_circular=true", "3_length=90666_depth=2.61x_circular=true",
"3_length=96544_depth=1.98x_circular=true", "38_length=14280_depth=2.50x",
"39_length=41187_depth=6.10x_circular=true", "4_length=129927_depth=0.88x",
"4_length=161129_depth=0.64x_circular=true", "4_length=165104_depth=0.58x_circular=true",
"4_length=170202_depth=0.80x", "4_length=41182_depth=1.27x_circular=true",
"4_length=41186_depth=4.34x_circular=true", "4_length=41188_depth=2.88x_circular=true",
"4_length=41190_depth=2.44x_circular=true", "4_length=41190_depth=3.46x_circular=true",
"4_length=41215_depth=3.66x_circular=true", "4_length=41224_depth=2.50x_circular=true",
"4_length=46161_depth=2.45x_circular=true", "4_length=51479_depth=1.11x_circular=true",
"4_length=71795_depth=2.16x_circular=true", "4_length=71859_depth=1.18x_circular=true",
"4_length=71861_depth=0.80x_circular=true", "4_length=71861_depth=1.56x_circular=true",
"4_length=71861_depth=1.95x_circular=true", "4_length=71861_depth=3.09x_circular=true",
"4_length=71861_depth=3.28x_circular=true", "4_length=71868_depth=0.67x_circular=true",
"4_length=71875_depth=0.43x_circular=true", "4_length=72162_depth=0.61x_circular=true",
"4_length=72162_depth=1.28x_circular=true", "4_length=73397_depth=1.60x_circular=true",
"4_length=73399_depth=2.01x_circular=true", "4_length=88057_depth=1.72x_circular=true",
"46_length=5494_depth=4.49x", "5_length=110787_depth=5.28x_circular=true",
"5_length=41185_depth=3.00x_circular=true", "5_length=41190_depth=2.13x_circular=true",
"5_length=42336_depth=2.31x_circular=true", "5_length=46161_depth=2.20x_circular=true",
"5_length=51479_depth=1.02x_circular=true", "5_length=51479_depth=2.10x_circular=true",
"5_length=55129_depth=3.86x_circular=true", "5_length=6141_depth=16.45x_circular=true",
"5_length=62044_depth=5.10x", "5_length=6211_depth=4.24x_circular=true",
"5_length=65498_depth=0.98x_circular=true", "5_length=70472_depth=2.31x",
"5_length=71861_depth=1.24x_circular=true", "6_length=41190_depth=4.77x_circular=true",
"6_length=46161_depth=0.86x_circular=true", "6_length=71861_depth=2.24x_circular=true",
"6_length=7604_depth=3.49x_circular=true", "6_length=80977_depth=0.65x_circular=true",
"6_length=95567_depth=1.42x_circular=true", "64_length=6420_depth=2.15x"
), class = "factor"), sseqid = c("NDM-1", "NDM-5", "OXA-181",
"NDM-5", "OXA-181", "NDM-1", "OXA-181", "OXA-181", "NDM-5",
"NDM-5", "OXA-181", "OXA-181", "OXA-181", "OXA-181", "OXA-181",
"OXA-181"), qlen = c(41190L, 88353L, 51479L, 46161L, 7604L,
41190L, 5418220L, 5418220L, 75214L, 70472L, 67915L, 67915L,
67915L, 89872L, 89872L, 89872L), qstart = c(23131L, 14408L,
25135L, 25547L, 5873L, 23131L, 5244180L, 4252066L, 36917L,
20047L, 51138L, 44729L, 38320L, 4678L, 11087L, 88141L), qend = c(23943L,
15220L, 25932L, 26359L, 6670L, 23943L, 5244977L, 4252863L,
37729L, 20859L, 51935L, 45526L, 39117L, 5475L, 11884L, 88938L
)), .Names = c("assemblyName", "qseqid", "sseqid", "qlen",
"qstart", "qend"), row.names = c(78L, 209L, 223L, 389L, 403L,
656L, 747L, 748L, 771L, 785L, 1422L, 1423L, 1424L, 1501L, 1502L,
1503L), class = "data.frame")

We can create a key column combining qseqid and sseqid and then select those assemblyName who either have more one distinct value of key or have only one row in them.
library(dplyr)
df %>%
mutate(key = paste0(qseqid, sseqid)) %>%
group_by(assemblyName) %>%
filter(n_distinct(key) > 1 | n() == 1) %>%
select(-key)
# assemblyName qseqid sseqid qlen qstart end
# <fct> <fct> <chr> <int> <int> <int>
# 1 batch1_08032019_ENT1420_assembly.fasta 5_length=41190_depth=2.13x_circular=true NDM-1 41190 23131 23943
# 2 batch2_21032019_ENT1250_assembly.fasta 3_length=88353_depth=2.00x_circular=true NDM-5 88353 14408 15220
# 3 batch2_21032019_ENT1250_assembly.fasta 4_length=51479_depth=1.11x_circular=true OXA-181 51479 25135 25932
# 4 batch2_21032019_ENT285_assembly.fasta 5_length=46161_depth=2.20x_circular=true NDM-5 46161 25547 26359
# 5 batch2_21032019_ENT285_assembly.fasta 6_length=7604_depth=3.49x_circular=true OXA-181 7604 5873 6670
# 6 batch2_21032019_ENT5_assembly.fasta 6_length=41190_depth=4.77x_circular=true NDM-1 41190 23131 23943
# 7 batch2_21032019_ENT924_assembly.fasta 1_length=5418220_depth=1.00x_circular=true OXA-181 5418220 5244180 5244977
# 8 batch2_21032019_ENT924_assembly.fasta 1_length=5418220_depth=1.00x_circular=true OXA-181 5418220 4252066 4252863
# 9 batch2_21032019_ENT924_assembly.fasta 3_length=75214_depth=2.77x_circular=true NDM-5 75214 36917 37729
#10 batch2_21032019_ENT924_assembly.fasta 5_length=70472_depth=2.31x NDM-5 70472 20047 20859

We can also use str_c
library(dplyr)
library(stringr)
library(dplyr)
df %>%
mutate(key = str_c(qseqid, sseqid)) %>%
group_by(assemblyName) %>%
slice(which(n_distinct(key) > 1 | n() == 1)) %>%
select(-key)

Related

How to pull out only one beta coefficient for each group separately from the regression equation and calculate mean by variables in R

here little example of my data.
sales.data=structure(list(MDM_Key = c(370L, 370L, 370L, 370L, 370L, 370L,
370L, 371L, 371L, 371L, 371L, 371L, 371L, 371L), sale_count = c(30L,
32L, 32L, 24L, 20L, 15L, 23L, 30L, 32L, 32L, 24L, 20L, 15L, 23L
), iek_disc_price = c(38227.08, 38227.08, 33739.7, 38227.08,
38227.08, 28844.16, 31649.255, 38227.08, 38227.08, 33739.7, 38227.08,
38227.08, 28844.16, 31649.255)), class = "data.frame", row.names = c(NA,
-14L))
i perform regression analysis
str(sales.data)
m1<-lm(formula=sale_count~iek_disc_price,data=sales.data)
summary(m1)
But the main difficulty is that for each group (MDM_Key) I don't need all the regression results from the summary, but only one beta coefficient.
here
B=0.0008559.
but then i need calculate mean value for sale_count and the mean for iek_disc_price (also for each mdm key group)
so the desired result would be like this
MDM_Key beta mean(sale_count) mean(iek_disc_price)
370 0.0008559 25.14 35305
371 0.0008559 25.14 35305
How to take only beta (nor intercept)regression coefficient for each group mdm_key
and also for each group, calculate the mean values for sale_count and iek_disc_price to get the summary table indicated above.
Thank you for your help.

If I understood correctly, you want to apply one regression per MDM_Key.
library(dplyr)
library(purrr)
library(broom)
sales.data %>%
group_by(MDM_Key) %>%
mutate(
mean_sale_count = mean(sale_count),
mean_iek_disc_price = mean(iek_disc_price)
) %>%
nest(-MDM_Key,-mean_sale_count,-mean_iek_disc_price) %>%
mutate(
coefs = map(.x = data,.f = ~tidy(lm(formula=.$sale_count~.$iek_disc_price,data=.)))
) %>%
unnest(coefs) %>%
filter(term != "(Intercept)") %>%
select(MDM_Key,beta = estimate,mean_sale_count,mean_iek_disc_price)
# A tibble: 2 x 4
# Groups: MDM_Key [2]
MDM_Key beta mean_sale_count mean_iek_disc_price
<int> <dbl> <dbl> <dbl>
1 370 0.000856 25.1 35306.
2 371 0.000856 25.1 35306.

Using R base and the split + apply + combine strategy:
do.call(rbind, lapply(split(sales.data, sales.data$MDM_Key), function(i) {
c(beta=coef(lm(sale_count~iek_disc_price, data=i))[2],
sale_count_mean=mean(i$sale_count),
iek_disc_price_mean=mean(i$iek_disc_price))
} ))
beta.iek_disc_price sale_count_mean iek_disc_price_mean
370 0.0008558854 25.14286 35305.92
371 0.0008558854 25.14286 35305.92

Get the means using aggregate and the beta values using lmList and then put them together and rearrange the columns in the order shown in the question. Omit [, c(2:1, 3:4)] if the column order doesn't matter. Note that nlme comes with R and does not have to be installed.
library(nlme) # lmList
means <- aggregate(. ~ MDM_Key, sales.data, mean)
fm <- lmList(sale_count ~ iek_disc_price | MDM_Key, sales.data)
cbind(beta = coef(fm)[, 2], means)[, c(2:1, 3:4)]
## MDM_Key beta sale_count iek_disc_price
## 1 370 0.0008558854 25.14286 35305.92
## 2 371 0.0008558854 25.14286 35305.92

Intersect buffer with vector to extract information

This is my first attempt at doing GIS analysis in R so hopefully what I've done so far and my question makes sense.
I have a data frame of points within the UK which I have converted to a sf object and calculated a 250m buffer around each point. I also have some land cover vector data which I've also converted into a sf object. The land cover data is split into land cover types.
I would like to calculate the percentage of different land cover types found within each 250m buffer. I've managed to get what I think is the vector data for each buffer, but I'm not sure how to extract the information (i.e. area/proportion of each land cover type) from each buffer.
This is what I've got so far:
XY data below
require(rgdal)
library(mapview)
library(sf)
sites <- structure(list(plot = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 46L,
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L,
85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 171L, 172L, 173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L,
181L, 182L, 183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L),
gridref = c("ST7498436114", "ST7492436114", "ST7486436114",
"ST7528433234", "ST7516436174", "ST7498436174", "ST7522433294",
"ST7468435994", "ST7516436054", "ST7504436054", "ST7498436054",
"ST7576432994", "ST7576533004", "ST7492436054", "ST7486436054",
"ST7480436054", "ST7522436114", "ST7474435934", "ST7468435934",
"ST7570433114", "ST7504435994", "ST7582432874", "ST7498435994",
"ST7576432874", "ST7492435994", "ST7570432874", "ST7486435994",
"ST7480435994", "ST7474435994", "ST7564435094", "ST7558435094",
"ST7546432934", "ST7420435514", "ST7420435574", "ST7426435694",
"ST7576432814", "ST7426435754", "ST7420435754", "ST7426435814",
"ST7486435025", "ST7612432874", "ST7607132839", "ST7588435034",
"ST7582435034", "ST7576435034", "ST7522435034", "ST7498435034",
"ST7582432754", "ST7581632758", "ST7576435094", "ST7570435094",
"ST7570432754", "ST7516434914", "ST7564432754", "ST7510434914",
"ST7489834968", "ST7594434974", "ST7594432694", "ST7597532791",
"ST7588434974", "ST7528434974", "ST7522434974", "ST7504434974",
"ST7606434854", "ST7546434854", "ST7552432694", "ST7540434854",
"ST7522434854", "ST7492434854", "ST7600434914", "ST7540434914",
"ST7618432694", "ST7534434914", "ST7558434734", "ST7552434734",
"ST7504434734", "ST7498434734", "ST7576432574", "ST7606434794",
"ST7564432574", "ST7558434794", "ST7546434794", "ST7496634827",
"ST7618432634", "ST7630434494", "ST7624434554", "ST7522434554",
"ST7516434554", "ST7618434614", "ST7618434674", "ST7612434674",
"ST7564434734", "ST7648434314", "ST7648434374", "ST7570432454",
"ST7642434374", "ST7642434434", "ST7636434434", "ST7636434494",
"ST7576432394", "ST7522433714", "ST7588432274", "ST7516433774",
"ST7522433594", "ST7540433534", "ST7546433474", "ST7534433474",
"ST7528433474", "ST7492436174", "ST7522436234", "ST7486436234",
"ST7534433354", "ST7480436234", "ST7498436294", "ST7516433354",
"ST7492436294", "ST7486436294", "ST7516436114", "ST7504436114",
"ST7492433834", "ST7498433894", "ST7504433714", "ST7504433474",
"ST7576432634", "ST7527336046", "ST7533436050", "ST7496835930",
"ST7491135929", "ST7536336131", "ST7532436239", "ST7546033412",
"ST7533433668", "ST7546733054", "ST7546033171", "ST7460735809",
"ST7455135810", "ST7492533597", "ST7516133894", "ST7611032751",
"ST7599532629", "ST7563133004", "ST7558533053", "ST7594132635",
"ST7600032936"), lon = c(-2.35882798600066, -2.35968529791987,
-2.36054260970957, -2.35434318476877, -2.35626020256111,
-2.35883216905146, -2.35520415532218, -2.36310607872696,
-2.35625189650525, -2.35796650101658, -2.35882380307907,
-2.34747232387985, -2.34745871838488, -2.35968110501236,
-2.36053840681614, -2.36139570849011, -2.35539873703486,
-2.36224456469475, -2.36310184613728, -2.34833723210959,
-2.35796232820981, -2.3466074559054, -2.35881962028668, -2.34746423080273,
-2.35967691223436, -2.34832100557515, -2.36053420405253,
-2.36139149574089, -2.36224878729914, -2.34932832124224,
-2.35018546484601, -2.35175219973381, -2.36992990937229,
-2.36993422091177, -2.36908560394007, -2.34746018445156,
-2.36908990589506, -2.36994715632925, -2.36909420798293,
-2.36046634739346, -2.34232357955569, -2.34307806730496,
-2.3458957145579, -2.34675284868196, -2.34760998268123, -2.35532418300805,
-2.3587527131752, -2.34659938326637, -2.34671388643665, -2.34761403365783,
-2.34847117751274, -2.34831289305932, -2.35617301455928,
-2.34916964776812, -2.357030127205, -2.35997665493164, -2.34503455939921,
-2.3448818567799, -2.34444568960825, -2.3458916836673, -2.35446291945093,
-2.35532004233278, -2.35789141020988, -2.34331230896304,
-2.35188334891386, -2.35087907083258, -2.35274045221757,
-2.35531176136585, -2.35959727405167, -2.34417342420112,
-2.35274456269774, -2.34145487458687, -2.3536016758543, -2.35016098122337,
-2.35101806482134, -2.35787472903205, -2.35873181148214,
-2.34744400029595, -2.34330830838447, -2.34915745002822,
-2.35016506151211, -2.35187924854011, -2.35899542001203,
-2.34145089838956, -2.33986012737858, -2.34072114258117,
-2.35529106118602, -2.35614811408587, -2.34158217774011,
-2.34158615799082, -2.342443232856, -2.3493038974994, -2.33727720150278,
-2.33728113125349, -2.34829261396095, -2.33813815683999,
-2.33814209668946, -2.33899913213172, -2.33900308208057,
-2.34743186349066, -2.35523311768249, -2.34571042235175,
-2.35609417888437, -2.35522484208367, -2.35265005375973,
-2.3517890723444, -2.35350281992438, -2.35435969352383, -2.3596894909569,
-2.35540702337447, -2.3605510158859, -2.35349458523648, -2.36140834751853,
-2.35884053554068, -2.35606514581882, -2.35969787741951,
-2.36055521916881, -2.35625604946904, -2.35797067395225,
-2.35952605843373, -2.35867330390757, -2.35780385700866,
-2.35778718664423, -2.3474480461475, -2.35469391232931, -2.3538225975963,
-2.35904376714229, -2.35985811443216, -2.35341379671676,
-2.35397847963346, -2.3518419625423, -2.35365895139517, -2.35171755255592,
-2.35182550302169, -2.3641931796262, -2.36499336018916, -2.35949523507335,
-2.35614532235317, -2.34251531682126, -2.34414928167033,
-2.34937223011837, -2.35003243626169, -2.34492074441706,
-2.34409838966173), lat = c(51.1237571019443, 51.123754466421,
51.1237518246133, 51.0978735338664, 51.1243044830452, 51.1242966140316,
51.0984104468623, 51.1226648376649, 51.123225458519, 51.1232202189951,
51.1232175898066, 51.0957360604064, 51.0958260221032, 51.1232149543339,
51.1232123125767, 51.1232096645353, 51.1237675811927, 51.1221279861883,
51.12212532568, 51.0968125396748, 51.1226807067568, 51.0946595744632,
51.1226780776186, 51.0946570295548, 51.1226754421963, 51.0946544783684,
51.1226728004898, 51.122670152499, 51.1226674982241, 51.1146139554378,
51.1146113898789, 51.0951837260487, 51.1183272329659, 51.1188667448915,
51.1199484791649, 51.0941175140535, 51.1204879909912, 51.1204852803661,
51.1210275027673, 51.1139596733116, 51.0946722048351, 51.0943552673697,
51.1140846411133, 51.1140821007327, 51.1140795540699, 51.1140563514,
51.1140458757624, 51.0935805433128, 51.0936161720752, 51.1146190677084,
51.1146165147143, 51.0935754474132, 51.1129747154744, 51.0935728900467,
51.1129720998063, 51.1134486332581, 51.1135476613768, 51.0930460983536,
51.0939196213418, 51.1135451273271, 51.1135194413093, 51.1135168381554,
51.1135089910009, 51.1124736826157, 51.1124481860401, 51.0930282410747,
51.1124456018318, 51.1124378115152, 51.1124247020154, 51.1130106752108,
51.1129851153262, 51.0930561644051, 51.1129825247864, 51.1113743081748,
51.1113717366288, 51.1113509381199, 51.1113483100386, 51.0919594515448,
51.1119341685333, 51.091954343383, 51.111913821918, 51.1119086724458,
51.1121837658749, 51.0925166484142, 51.1092466189503, 51.1097836374543,
51.1097402440333, 51.109737634947, 51.1103206495309, 51.1108601638605,
51.1108576551798, 51.111376873439, 51.1076355248773, 51.1081750396969,
51.0908778691438, 51.1081725626614, 51.1087120773832, 51.1087095940192,
51.1092491086431, 51.0903409041346, 51.1021870483868, 51.0892669548417,
51.1027239542518, 51.1011080196316, 51.1005762922147, 51.1000393606328,
51.1000341882122, 51.1000315925832, 51.1242939784578, 51.1248466057182,
51.1248308485354, 51.0989551588552, 51.1248282003419, 51.1253756380551,
51.0989473534293, 51.1253730023804, 51.125370360421, 51.1237649708073,
51.1237597311831, 51.1032529717066, 51.1037951193141, 51.1021792043864,
51.1000211472752, 51.0924989672475, 51.1231582609789, 51.1231968706504,
51.1221018957455, 51.1220903988326, 51.1239264662945, 51.1248959019189,
51.0994816900486, 51.1017781863846, 51.0962628852802, 51.0973146386257,
51.1209979188271, 51.1210044144558, 51.1011219350515, 51.1038028520133,
51.0935656115046, 51.092463770299, 51.0958203158045, 51.0962589537862,
51.092515448027, 51.0952245028064)), class = "data.frame", row.names = c(NA,
-144L))
Convert XY data to sf object and create buffer
sites2 = st_as_sf(sites,coords=c("lon","lat"), dim = "XY", crs=4326)
coords <- st_geometry(sites2)
st_crs(coords)
buff_coords <- st_buffer(coords, dist = 250)
mapview(coords) + mapview(buff_coords)
Land cover data here
Read in land cover data and use st_intersection to get land cover data within each buffer
land_cover <- st_read(
"land_cover.shp")
buf_intersects <- st_intersection(buff_coords, land_cover)
buff <- st_as_sf(buf_intersects) # convert to sf object
mapview(coords, col.regions="red", cex=2) + mapview(buff)
This is where I've got to. It looks correct, each buffer has the vector lines within it.
What I'm struggling with is how to extract information from the buff object. I would like either area or proportion of each land cover type.
Any ideas? Thanks!

What you need to consider is that the output of st_intersection() keeps the attributes of x whereas y is only used for clipping. Since buff_coords is of type geometry set without any attributes, your output only includes geometry information. That's why your result looks correct. What is missing, and what you are interested in, are all the attributes to calculate areas, so simply switch the inputs:
st_intersection(land_cover, buff_coords)
#> Simple feature collection with 2153 features and 12 fields
#> Geometry type: GEOMETRY
#> Dimension: XY
#> Bounding box: xmin: -2.373569 ymin: 51.087 xmax: -2.333649 ymax: 51.12766
#> Geodetic CRS: WGS 84
#> First 10 features:
#> [...]
In order to determine proportions of land cover classes you might want to calculate areas of your classes within the buffer per site and use aggregate() to sum areas of the same class in order to get the weights desired.
I'm not quite sure if you can do this operation all at once, so I'll just show the workflow for the first site feature so that you should be able to construct a for-loop in order to iterate over all your features:
# clipping lc data with buffer of first site
x <- st_intersection(land_cover, sites[1,] |> st_geometry() |> st_buffer(dist = 250))
# calculate area
x["area"] <- st_area(x)
# sum area per INTCODE
aggregate(area ~ INTCODE, x, sum)
#> INTCODE area
#> 1 1 49098.67741 [m^2]
#> 2 2 148094.15274 [m^2]
#> 3 3 2090.11273 [m^2]
#> 4 4 55.95573 [m^2]
#> 5 5 41.40524 [m^2]
Edit:
I also noticed the area does not quite add up to pi * 250^2 and I guess this is a result of using st_buffer() and st_area() on a dataset with a geodetic coordinate reference system (EPSG: 4326). In order to reproject your data (land_cover & sites) to a commomly used projected system in your area of interest - I used WGS 84 / UTM zone 30N (EPSG: 32630) here, but you'll probably know better - you can make use of st_transform() resulting in desired values:
a1 <- sites |> st_buffer(dist = 250) |> st_area()
head(a1)
#> Units: [m^2]
#> [1] 199380.3 199244.6 199278.6 199276.3 199343.5 199301.7
a2 <- sites |> sf::st_transform("epsg:32630") |> st_buffer(dist = 250) |> st_area()
head(a2)
#> Units: [m^2]
#> [1] 196259.8 196259.8 196259.8 196259.8 196259.8 196259.8
However, since you seem to be interested in proportions only and not absolute areas per se, I would not expect significant distortions here based on your radius chosen - but I'll probably still go for a suitable projected reference system.

rounding up time in a date-time variable in R

I have been looking for a solution for a problem I thought is easy to solve but it doesn't seem so. I am trying roundup a DateTime Variable to the next 5 minutes.
Incidents<- structure(list(Event.Id = c(240252L, 240258L, 240261L, 240264L,
240294L, 240313L, 240314L, 240315L, 240331L, 240343L, 240350L,
240358L, 240379L, 240381L, 240396L), DateTime = structure(c(1427868300,
1427872800, 1427873100, 1427873700, 1427881500, 1427890260, 1427890860,
1427890980, 1427900160, 1427902860, 1427904480, 1427906700, 1427910900,
1427911920, 1427919360), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(5L,
11L, 14L, 15L, 40L, 59L, 60L, 61L, 77L, 90L, 96L, 103L, 122L,
123L, 135L), class = "data.frame")
I used:
Incidents$DateTime2<- round_date(Incidents$DateTime,unit="5 minutes")
However, this rounds the time to the closest 5 minutes. I want the next 5 minutes instead.
I tried this code below:
Incidents<- Incidents %>% mutate(time = as.numeric(substr(DateTime, 16,16)))
Incidents$time <- ifelse(Incidents$time <6,5- Incidents$time, 10 - Incidents$time)
minut<- minutes(5)
Incidents$DateTime3<- ifelse(Incidents$time <3, Incidents$DateTime2 + minut, Incidents$DateTime2)
However, this gives me numeric values that I could not convert to date time variable.

Check if a column contains a sequence

I want to know, if i can check out if a column of a data frame starts with 0 or 1 and goes till the number of rows without breaking the sequence. Below is a sample data frame.
structure(list(X = 1:22, SNR = c(1.0035798429, 11.9438978154,
NA, 3.2894877794, 4.0170266411, 1.6310522977, 1.6405414787, 1.6625412522,
0.8489116253, 7.5312259672, 7.2832910726, 0.5732577083, NA, 0.8149754292,
1.9981020389, 1.2477052103, 0.9960804911, 10.3402683931, 3.6328270728,
2.5540496855, 41.96873985, 6.2035281045), ID = c(109L, 110L,
111L, 112L, 113L, 114L, 116L, 117L, 118L, 119L, 120L, 121L, 123L,
124L, 125L, 126L, 127L, 128L, 130L, 131L, 132L, 133L), SignalIntensity = c(6.8173738339,
11.5459925418, NA, 9.7804203445, 9.8719842219, 9.0781857736,
8.2289312163, 8.0435364446, 6.1793458315, 10.5581798932, 10.4745329822,
4.1572943809, NA, 6.0451742752, 8.3100219509, 7.4558770659, 7.1464749962,
11.4284386394, 9.6273795753, 9.6807417299, 13.3364944397, 10.4304671876
)), .Names = c("X", "SNR", "ID", "SignalIntensity"), class = "data.frame", row.names = c(NA,
-22L))
How can i check the columns and return the index if present.
Edited: The sequence i am looking for is a natural sequence. Suppose if a data frame has 10 rows, the column if present should have a sequence 1,2,3,4,5,6,7,8,9,10 or may be like0,1,2,3,4,5,6,7,8,9. . So the sequence starts with 0 or 1 and goes till the number of rows with an increment of 1 for each row.

You could loop through the columns with sapply. Create a function to check whether there are any NAs. If not (!any), we get the difference (diff) between the adjacent element, check if all the element difference is 1 (all(diff(x)==1) and (&) the first value of the column is 0 or 1 (x[1] %in% 0:1). If there is any NA, the output for that column will be 'FALSE'.
f1 <- function(x) {
if(!any(is.na(x)))
all(diff(x)==1) & x[1] %in% 0:1
else FALSE}
which(sapply(df, f1))
#X
#1

How to extract block of rows in R

here is a example of my data frame (the original has ~ 10 000 rows). I would like to extract blocks of row based on VariableC. I only want to keep rows between FALSEs. But only "blocks" with a minimum number of rows of 10 (randomly located in the data frame) and discard the others. In other words, I want to split my data frame into sub data frames (i.e. block of rows). An alternative would be to create a new column with each block having an individual number or letter. The end goal is to plot (regression) VariableA and VariableB for each block and extract the regression and slope coefficients of each block. I know how to do the last part but I can't find a solution on how to extract the blocks.
dput(DF)
structure(list(VariableA = c(-0.427796831, -0.985783635, 0.07381913,
-0.788768923, 2.088999368, 1.634064399, -0.396180684, 1.242763624,
-0.925287904, -1.127545153, -1.392674655, -0.988900906, -0.08007986,
1.123984722, 0.698530819, -0.983565282, 0.568517376, -0.349446274,
0.451443794, -0.525897224, -0.932426185, -1.026114049, -0.502973503,
0.779152951, -0.636137726, -0.488850226, 0.281389897, -0.058183652,
-0.490377469, 0.541441864, 0.101754052, -0.16701156, 0.830697787,
0.383672008, 0.376444634, 0.377695822, -0.167281753, 0.85629382,
0.213632586, -0.180474289, 1.008370316, -0.039110304, -0.498537412,
-2.804652051, -0.308652164, -0.57234963, 0.599951896, 0.52484456,
0.008141731, -0.355182154, -0.401441593, 1.201478908, 0.656311257,
0.459034655), VariableB = c(-0.599169932, -0.874625086, -0.879367189,
0.068133167, -0.800781757, -0.746429115, -0.231178499, -0.905456972,
0.40165965, 0.664579078, -0.386614574, -0.700272577, 1.844891234,
0.277616227, 0.560119708, -2.874313318, 0.835592571, -0.66310824,
0.770336487, 1.547635124, -0.604065751, 1.009519877, -0.54792181,
-0.904229067, -0.309270319, 0.16088111, 0.325712725, -0.931632811,
-1.124531146, -0.24012375, -0.887921437, -1.531276383, 1.565233292,
0.462452663, 0.836271408, -0.721959208, 1.92215585, 0.189964832,
1.661140854, -1.604886269, -1.237132008, 0.811584528, -0.965798536,
2.604504203, -1.124331258, 0.240004185, -0.34902354, -0.447056073,
0.051475583, 0.159486311, -1.86620661, -1.671688795, -1.268626575,
-1.734731137), VariableC = structure(c(11L, 19L, 9L, 36L, 36L,
26L, 7L, 24L, 36L, 5L, 17L, 15L, 33L, 30L, 29L, 21L, 31L, 10L,
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 8L,
16L, 35L, 25L, 28L, 4L, 32L, 27L, 34L, 18L, 36L, 36L, 14L, 2L,
13L, 3L, 36L, 23L, 22L, 1L, 20L, 6L, 36L, 12L), .Label = c("-0.019569584",
"-0.020014785", "-0.033234545", "-0.034426339", "-0.046296608",
"-0.047020989", "-0.062735918", "-0.078616739", "-0.080554806",
"-0.101255451", "-0.102696676", "-0.127569648", "-0.143298342",
"-0.146433595", "-0.168917348", "-0.169828794", "-0.177928923",
"-0.178536056", "-0.186040872", "-0.22676482", "-0.38578786",
"0.005961731", "0.007778849", "0.033730665", "0.084612467", "0.088763528",
"0.104625865", "0.121271604", "0.125865053", "0.140160095", "0.140410995",
"0.17548741", "0.176481137", "0.187477344", "0.239593108", "FALSE"
), class = "factor")), .Names = c("VariableA", "VariableB", "VariableC"
), class = "data.frame", row.names = c(NA, -54L))

Here's an approach:
# create indicator variable
df$ind <- cumsum(df$VariableC == "FALSE")
# remove "FALSE" rows
df_sub <- df[df$VariableC != "FALSE", ]
# run a regression for each unique ind value
library(MASS)
lmList(VariableA ~ VariableB | ind, data = df_sub)
The result:
Call: lmList(formula = VariableA ~ VariableB | ind, data = df_sub)
Coefficients:
(Intercept) VariableB
0 -0.40531670 0.05261483
2 -0.93213791 -2.80237922
3 -0.26593782 0.31197216
15 0.24240710 0.10646927
17 -0.92256481 -0.65475348
18 0.02793152 -0.22209490
19 0.45903466 NA
Degrees of freedom: 35 total; 21 residual
Residual standard error: 0.6656342
How to create a plot?
library(ggplot2)
ggplot(df_sub, aes(x = VariableB, y = VariableA)) +
geom_point() +
facet_wrap( ~ ind) +
geom_smooth(method = lm)

You could do as follows:
falseIdx <- which(as.character(DF$VariableC) == "FALSE")
# at least 2 FALSE's must be present...
if(length(falseIdx) >= 2){
blocks <-
lapply(2:(length(falseIdx)-1),FUN=function(idx){
currFalse <- falseIdx[idx]
prevFalse <- falseIdx[idx-1]
# we build a block only if it has at least 10 rows
if(currFalse - prevFalse - 1 >= 10){
return(DF[(prevFalse+1):(currFalse-1),])
}else{
return(NULL)
}
})
# remove nulls
blocks[sapply(blocks, is.null)] <- NULL
}else{
blocks <- list()
}
Computing on your example data, blocks contains only one data.frame:
> blocks
[[1]]
VariableA VariableB VariableC
31 0.1017541 -0.8879214 -0.078616739
32 -0.1670116 -1.5312764 -0.169828794
33 0.8306978 1.5652333 0.239593108
34 0.3836720 0.4624527 0.084612467
35 0.3764446 0.8362714 0.121271604
36 0.3776958 -0.7219592 -0.034426339
37 -0.1672818 1.9221558 0.17548741
38 0.8562938 0.1899648 0.104625865
39 0.2136326 1.6611409 0.187477344
40 -0.1804743 -1.6048863 -0.178536056

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

subset a r dataframe based on conditions - r

We can also use str_c library(dplyr) library(stringr) library(dplyr) df %>% mutate(key = str_c(qseqid, sseqid)) %>% group_by(assemblyName) %>% slice(which(n_distinct(key) > 1 | n() == 1)) %>% select(-key)

Related

How to pull out only one beta coefficient for each group separately from the regression equation and calculate mean by variables in R

Intersect buffer with vector to extract information

rounding up time in a date-time variable in R

Check if a column contains a sequence

How to extract block of rows in R

Categories

Resources