How to extract block of rows in R

How to extract block of rows in R - r

here is a example of my data frame (the original has ~ 10 000 rows). I would like to extract blocks of row based on VariableC. I only want to keep rows between FALSEs. But only "blocks" with a minimum number of rows of 10 (randomly located in the data frame) and discard the others. In other words, I want to split my data frame into sub data frames (i.e. block of rows). An alternative would be to create a new column with each block having an individual number or letter. The end goal is to plot (regression) VariableA and VariableB for each block and extract the regression and slope coefficients of each block. I know how to do the last part but I can't find a solution on how to extract the blocks.
dput(DF)
structure(list(VariableA = c(-0.427796831, -0.985783635, 0.07381913,
-0.788768923, 2.088999368, 1.634064399, -0.396180684, 1.242763624,
-0.925287904, -1.127545153, -1.392674655, -0.988900906, -0.08007986,
1.123984722, 0.698530819, -0.983565282, 0.568517376, -0.349446274,
0.451443794, -0.525897224, -0.932426185, -1.026114049, -0.502973503,
0.779152951, -0.636137726, -0.488850226, 0.281389897, -0.058183652,
-0.490377469, 0.541441864, 0.101754052, -0.16701156, 0.830697787,
0.383672008, 0.376444634, 0.377695822, -0.167281753, 0.85629382,
0.213632586, -0.180474289, 1.008370316, -0.039110304, -0.498537412,
-2.804652051, -0.308652164, -0.57234963, 0.599951896, 0.52484456,
0.008141731, -0.355182154, -0.401441593, 1.201478908, 0.656311257,
0.459034655), VariableB = c(-0.599169932, -0.874625086, -0.879367189,
0.068133167, -0.800781757, -0.746429115, -0.231178499, -0.905456972,
0.40165965, 0.664579078, -0.386614574, -0.700272577, 1.844891234,
0.277616227, 0.560119708, -2.874313318, 0.835592571, -0.66310824,
0.770336487, 1.547635124, -0.604065751, 1.009519877, -0.54792181,
-0.904229067, -0.309270319, 0.16088111, 0.325712725, -0.931632811,
-1.124531146, -0.24012375, -0.887921437, -1.531276383, 1.565233292,
0.462452663, 0.836271408, -0.721959208, 1.92215585, 0.189964832,
1.661140854, -1.604886269, -1.237132008, 0.811584528, -0.965798536,
2.604504203, -1.124331258, 0.240004185, -0.34902354, -0.447056073,
0.051475583, 0.159486311, -1.86620661, -1.671688795, -1.268626575,
-1.734731137), VariableC = structure(c(11L, 19L, 9L, 36L, 36L,
26L, 7L, 24L, 36L, 5L, 17L, 15L, 33L, 30L, 29L, 21L, 31L, 10L,
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 8L,
16L, 35L, 25L, 28L, 4L, 32L, 27L, 34L, 18L, 36L, 36L, 14L, 2L,
13L, 3L, 36L, 23L, 22L, 1L, 20L, 6L, 36L, 12L), .Label = c("-0.019569584",
"-0.020014785", "-0.033234545", "-0.034426339", "-0.046296608",
"-0.047020989", "-0.062735918", "-0.078616739", "-0.080554806",
"-0.101255451", "-0.102696676", "-0.127569648", "-0.143298342",
"-0.146433595", "-0.168917348", "-0.169828794", "-0.177928923",
"-0.178536056", "-0.186040872", "-0.22676482", "-0.38578786",
"0.005961731", "0.007778849", "0.033730665", "0.084612467", "0.088763528",
"0.104625865", "0.121271604", "0.125865053", "0.140160095", "0.140410995",
"0.17548741", "0.176481137", "0.187477344", "0.239593108", "FALSE"
), class = "factor")), .Names = c("VariableA", "VariableB", "VariableC"
), class = "data.frame", row.names = c(NA, -54L))

Here's an approach:
# create indicator variable
df$ind <- cumsum(df$VariableC == "FALSE")
# remove "FALSE" rows
df_sub <- df[df$VariableC != "FALSE", ]
# run a regression for each unique ind value
library(MASS)
lmList(VariableA ~ VariableB | ind, data = df_sub)
The result:
Call: lmList(formula = VariableA ~ VariableB | ind, data = df_sub)
Coefficients:
(Intercept) VariableB
0 -0.40531670 0.05261483
2 -0.93213791 -2.80237922
3 -0.26593782 0.31197216
15 0.24240710 0.10646927
17 -0.92256481 -0.65475348
18 0.02793152 -0.22209490
19 0.45903466 NA
Degrees of freedom: 35 total; 21 residual
Residual standard error: 0.6656342
How to create a plot?
library(ggplot2)
ggplot(df_sub, aes(x = VariableB, y = VariableA)) +
geom_point() +
facet_wrap( ~ ind) +
geom_smooth(method = lm)

You could do as follows:
falseIdx <- which(as.character(DF$VariableC) == "FALSE")
# at least 2 FALSE's must be present...
if(length(falseIdx) >= 2){
blocks <-
lapply(2:(length(falseIdx)-1),FUN=function(idx){
currFalse <- falseIdx[idx]
prevFalse <- falseIdx[idx-1]
# we build a block only if it has at least 10 rows
if(currFalse - prevFalse - 1 >= 10){
return(DF[(prevFalse+1):(currFalse-1),])
}else{
return(NULL)
}
})
# remove nulls
blocks[sapply(blocks, is.null)] <- NULL
}else{
blocks <- list()
}
Computing on your example data, blocks contains only one data.frame:
> blocks
[[1]]
VariableA VariableB VariableC
31 0.1017541 -0.8879214 -0.078616739
32 -0.1670116 -1.5312764 -0.169828794
33 0.8306978 1.5652333 0.239593108
34 0.3836720 0.4624527 0.084612467
35 0.3764446 0.8362714 0.121271604
36 0.3776958 -0.7219592 -0.034426339
37 -0.1672818 1.9221558 0.17548741
38 0.8562938 0.1899648 0.104625865
39 0.2136326 1.6611409 0.187477344
40 -0.1804743 -1.6048863 -0.178536056

Related

How to pull out only one beta coefficient for each group separately from the regression equation and calculate mean by variables in R

here little example of my data.
sales.data=structure(list(MDM_Key = c(370L, 370L, 370L, 370L, 370L, 370L,
370L, 371L, 371L, 371L, 371L, 371L, 371L, 371L), sale_count = c(30L,
32L, 32L, 24L, 20L, 15L, 23L, 30L, 32L, 32L, 24L, 20L, 15L, 23L
), iek_disc_price = c(38227.08, 38227.08, 33739.7, 38227.08,
38227.08, 28844.16, 31649.255, 38227.08, 38227.08, 33739.7, 38227.08,
38227.08, 28844.16, 31649.255)), class = "data.frame", row.names = c(NA,
-14L))
i perform regression analysis
str(sales.data)
m1<-lm(formula=sale_count~iek_disc_price,data=sales.data)
summary(m1)
But the main difficulty is that for each group (MDM_Key) I don't need all the regression results from the summary, but only one beta coefficient.
here
B=0.0008559.
but then i need calculate mean value for sale_count and the mean for iek_disc_price (also for each mdm key group)
so the desired result would be like this
MDM_Key beta mean(sale_count) mean(iek_disc_price)
370 0.0008559 25.14 35305
371 0.0008559 25.14 35305
How to take only beta (nor intercept)regression coefficient for each group mdm_key
and also for each group, calculate the mean values for sale_count and iek_disc_price to get the summary table indicated above.
Thank you for your help.

If I understood correctly, you want to apply one regression per MDM_Key.
library(dplyr)
library(purrr)
library(broom)
sales.data %>%
group_by(MDM_Key) %>%
mutate(
mean_sale_count = mean(sale_count),
mean_iek_disc_price = mean(iek_disc_price)
) %>%
nest(-MDM_Key,-mean_sale_count,-mean_iek_disc_price) %>%
mutate(
coefs = map(.x = data,.f = ~tidy(lm(formula=.$sale_count~.$iek_disc_price,data=.)))
) %>%
unnest(coefs) %>%
filter(term != "(Intercept)") %>%
select(MDM_Key,beta = estimate,mean_sale_count,mean_iek_disc_price)
# A tibble: 2 x 4
# Groups: MDM_Key [2]
MDM_Key beta mean_sale_count mean_iek_disc_price
<int> <dbl> <dbl> <dbl>
1 370 0.000856 25.1 35306.
2 371 0.000856 25.1 35306.

Using R base and the split + apply + combine strategy:
do.call(rbind, lapply(split(sales.data, sales.data$MDM_Key), function(i) {
c(beta=coef(lm(sale_count~iek_disc_price, data=i))[2],
sale_count_mean=mean(i$sale_count),
iek_disc_price_mean=mean(i$iek_disc_price))
} ))
beta.iek_disc_price sale_count_mean iek_disc_price_mean
370 0.0008558854 25.14286 35305.92
371 0.0008558854 25.14286 35305.92

Get the means using aggregate and the beta values using lmList and then put them together and rearrange the columns in the order shown in the question. Omit [, c(2:1, 3:4)] if the column order doesn't matter. Note that nlme comes with R and does not have to be installed.
library(nlme) # lmList
means <- aggregate(. ~ MDM_Key, sales.data, mean)
fm <- lmList(sale_count ~ iek_disc_price | MDM_Key, sales.data)
cbind(beta = coef(fm)[, 2], means)[, c(2:1, 3:4)]
## MDM_Key beta sale_count iek_disc_price
## 1 370 0.0008558854 25.14286 35305.92
## 2 371 0.0008558854 25.14286 35305.92

Intersect buffer with vector to extract information

This is my first attempt at doing GIS analysis in R so hopefully what I've done so far and my question makes sense.
I have a data frame of points within the UK which I have converted to a sf object and calculated a 250m buffer around each point. I also have some land cover vector data which I've also converted into a sf object. The land cover data is split into land cover types.
I would like to calculate the percentage of different land cover types found within each 250m buffer. I've managed to get what I think is the vector data for each buffer, but I'm not sure how to extract the information (i.e. area/proportion of each land cover type) from each buffer.
This is what I've got so far:
XY data below
require(rgdal)
library(mapview)
library(sf)
sites <- structure(list(plot = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 46L,
47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 55L, 56L, 57L, 58L,
59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L,
72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L,
85L, 86L, 87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L,
98L, 99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L,
109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L, 118L, 119L,
120L, 171L, 172L, 173L, 174L, 175L, 176L, 177L, 178L, 179L, 180L,
181L, 182L, 183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L),
gridref = c("ST7498436114", "ST7492436114", "ST7486436114",
"ST7528433234", "ST7516436174", "ST7498436174", "ST7522433294",
"ST7468435994", "ST7516436054", "ST7504436054", "ST7498436054",
"ST7576432994", "ST7576533004", "ST7492436054", "ST7486436054",
"ST7480436054", "ST7522436114", "ST7474435934", "ST7468435934",
"ST7570433114", "ST7504435994", "ST7582432874", "ST7498435994",
"ST7576432874", "ST7492435994", "ST7570432874", "ST7486435994",
"ST7480435994", "ST7474435994", "ST7564435094", "ST7558435094",
"ST7546432934", "ST7420435514", "ST7420435574", "ST7426435694",
"ST7576432814", "ST7426435754", "ST7420435754", "ST7426435814",
"ST7486435025", "ST7612432874", "ST7607132839", "ST7588435034",
"ST7582435034", "ST7576435034", "ST7522435034", "ST7498435034",
"ST7582432754", "ST7581632758", "ST7576435094", "ST7570435094",
"ST7570432754", "ST7516434914", "ST7564432754", "ST7510434914",
"ST7489834968", "ST7594434974", "ST7594432694", "ST7597532791",
"ST7588434974", "ST7528434974", "ST7522434974", "ST7504434974",
"ST7606434854", "ST7546434854", "ST7552432694", "ST7540434854",
"ST7522434854", "ST7492434854", "ST7600434914", "ST7540434914",
"ST7618432694", "ST7534434914", "ST7558434734", "ST7552434734",
"ST7504434734", "ST7498434734", "ST7576432574", "ST7606434794",
"ST7564432574", "ST7558434794", "ST7546434794", "ST7496634827",
"ST7618432634", "ST7630434494", "ST7624434554", "ST7522434554",
"ST7516434554", "ST7618434614", "ST7618434674", "ST7612434674",
"ST7564434734", "ST7648434314", "ST7648434374", "ST7570432454",
"ST7642434374", "ST7642434434", "ST7636434434", "ST7636434494",
"ST7576432394", "ST7522433714", "ST7588432274", "ST7516433774",
"ST7522433594", "ST7540433534", "ST7546433474", "ST7534433474",
"ST7528433474", "ST7492436174", "ST7522436234", "ST7486436234",
"ST7534433354", "ST7480436234", "ST7498436294", "ST7516433354",
"ST7492436294", "ST7486436294", "ST7516436114", "ST7504436114",
"ST7492433834", "ST7498433894", "ST7504433714", "ST7504433474",
"ST7576432634", "ST7527336046", "ST7533436050", "ST7496835930",
"ST7491135929", "ST7536336131", "ST7532436239", "ST7546033412",
"ST7533433668", "ST7546733054", "ST7546033171", "ST7460735809",
"ST7455135810", "ST7492533597", "ST7516133894", "ST7611032751",
"ST7599532629", "ST7563133004", "ST7558533053", "ST7594132635",
"ST7600032936"), lon = c(-2.35882798600066, -2.35968529791987,
-2.36054260970957, -2.35434318476877, -2.35626020256111,
-2.35883216905146, -2.35520415532218, -2.36310607872696,
-2.35625189650525, -2.35796650101658, -2.35882380307907,
-2.34747232387985, -2.34745871838488, -2.35968110501236,
-2.36053840681614, -2.36139570849011, -2.35539873703486,
-2.36224456469475, -2.36310184613728, -2.34833723210959,
-2.35796232820981, -2.3466074559054, -2.35881962028668, -2.34746423080273,
-2.35967691223436, -2.34832100557515, -2.36053420405253,
-2.36139149574089, -2.36224878729914, -2.34932832124224,
-2.35018546484601, -2.35175219973381, -2.36992990937229,
-2.36993422091177, -2.36908560394007, -2.34746018445156,
-2.36908990589506, -2.36994715632925, -2.36909420798293,
-2.36046634739346, -2.34232357955569, -2.34307806730496,
-2.3458957145579, -2.34675284868196, -2.34760998268123, -2.35532418300805,
-2.3587527131752, -2.34659938326637, -2.34671388643665, -2.34761403365783,
-2.34847117751274, -2.34831289305932, -2.35617301455928,
-2.34916964776812, -2.357030127205, -2.35997665493164, -2.34503455939921,
-2.3448818567799, -2.34444568960825, -2.3458916836673, -2.35446291945093,
-2.35532004233278, -2.35789141020988, -2.34331230896304,
-2.35188334891386, -2.35087907083258, -2.35274045221757,
-2.35531176136585, -2.35959727405167, -2.34417342420112,
-2.35274456269774, -2.34145487458687, -2.3536016758543, -2.35016098122337,
-2.35101806482134, -2.35787472903205, -2.35873181148214,
-2.34744400029595, -2.34330830838447, -2.34915745002822,
-2.35016506151211, -2.35187924854011, -2.35899542001203,
-2.34145089838956, -2.33986012737858, -2.34072114258117,
-2.35529106118602, -2.35614811408587, -2.34158217774011,
-2.34158615799082, -2.342443232856, -2.3493038974994, -2.33727720150278,
-2.33728113125349, -2.34829261396095, -2.33813815683999,
-2.33814209668946, -2.33899913213172, -2.33900308208057,
-2.34743186349066, -2.35523311768249, -2.34571042235175,
-2.35609417888437, -2.35522484208367, -2.35265005375973,
-2.3517890723444, -2.35350281992438, -2.35435969352383, -2.3596894909569,
-2.35540702337447, -2.3605510158859, -2.35349458523648, -2.36140834751853,
-2.35884053554068, -2.35606514581882, -2.35969787741951,
-2.36055521916881, -2.35625604946904, -2.35797067395225,
-2.35952605843373, -2.35867330390757, -2.35780385700866,
-2.35778718664423, -2.3474480461475, -2.35469391232931, -2.3538225975963,
-2.35904376714229, -2.35985811443216, -2.35341379671676,
-2.35397847963346, -2.3518419625423, -2.35365895139517, -2.35171755255592,
-2.35182550302169, -2.3641931796262, -2.36499336018916, -2.35949523507335,
-2.35614532235317, -2.34251531682126, -2.34414928167033,
-2.34937223011837, -2.35003243626169, -2.34492074441706,
-2.34409838966173), lat = c(51.1237571019443, 51.123754466421,
51.1237518246133, 51.0978735338664, 51.1243044830452, 51.1242966140316,
51.0984104468623, 51.1226648376649, 51.123225458519, 51.1232202189951,
51.1232175898066, 51.0957360604064, 51.0958260221032, 51.1232149543339,
51.1232123125767, 51.1232096645353, 51.1237675811927, 51.1221279861883,
51.12212532568, 51.0968125396748, 51.1226807067568, 51.0946595744632,
51.1226780776186, 51.0946570295548, 51.1226754421963, 51.0946544783684,
51.1226728004898, 51.122670152499, 51.1226674982241, 51.1146139554378,
51.1146113898789, 51.0951837260487, 51.1183272329659, 51.1188667448915,
51.1199484791649, 51.0941175140535, 51.1204879909912, 51.1204852803661,
51.1210275027673, 51.1139596733116, 51.0946722048351, 51.0943552673697,
51.1140846411133, 51.1140821007327, 51.1140795540699, 51.1140563514,
51.1140458757624, 51.0935805433128, 51.0936161720752, 51.1146190677084,
51.1146165147143, 51.0935754474132, 51.1129747154744, 51.0935728900467,
51.1129720998063, 51.1134486332581, 51.1135476613768, 51.0930460983536,
51.0939196213418, 51.1135451273271, 51.1135194413093, 51.1135168381554,
51.1135089910009, 51.1124736826157, 51.1124481860401, 51.0930282410747,
51.1124456018318, 51.1124378115152, 51.1124247020154, 51.1130106752108,
51.1129851153262, 51.0930561644051, 51.1129825247864, 51.1113743081748,
51.1113717366288, 51.1113509381199, 51.1113483100386, 51.0919594515448,
51.1119341685333, 51.091954343383, 51.111913821918, 51.1119086724458,
51.1121837658749, 51.0925166484142, 51.1092466189503, 51.1097836374543,
51.1097402440333, 51.109737634947, 51.1103206495309, 51.1108601638605,
51.1108576551798, 51.111376873439, 51.1076355248773, 51.1081750396969,
51.0908778691438, 51.1081725626614, 51.1087120773832, 51.1087095940192,
51.1092491086431, 51.0903409041346, 51.1021870483868, 51.0892669548417,
51.1027239542518, 51.1011080196316, 51.1005762922147, 51.1000393606328,
51.1000341882122, 51.1000315925832, 51.1242939784578, 51.1248466057182,
51.1248308485354, 51.0989551588552, 51.1248282003419, 51.1253756380551,
51.0989473534293, 51.1253730023804, 51.125370360421, 51.1237649708073,
51.1237597311831, 51.1032529717066, 51.1037951193141, 51.1021792043864,
51.1000211472752, 51.0924989672475, 51.1231582609789, 51.1231968706504,
51.1221018957455, 51.1220903988326, 51.1239264662945, 51.1248959019189,
51.0994816900486, 51.1017781863846, 51.0962628852802, 51.0973146386257,
51.1209979188271, 51.1210044144558, 51.1011219350515, 51.1038028520133,
51.0935656115046, 51.092463770299, 51.0958203158045, 51.0962589537862,
51.092515448027, 51.0952245028064)), class = "data.frame", row.names = c(NA,
-144L))
Convert XY data to sf object and create buffer
sites2 = st_as_sf(sites,coords=c("lon","lat"), dim = "XY", crs=4326)
coords <- st_geometry(sites2)
st_crs(coords)
buff_coords <- st_buffer(coords, dist = 250)
mapview(coords) + mapview(buff_coords)
Land cover data here
Read in land cover data and use st_intersection to get land cover data within each buffer
land_cover <- st_read(
"land_cover.shp")
buf_intersects <- st_intersection(buff_coords, land_cover)
buff <- st_as_sf(buf_intersects) # convert to sf object
mapview(coords, col.regions="red", cex=2) + mapview(buff)
This is where I've got to. It looks correct, each buffer has the vector lines within it.
What I'm struggling with is how to extract information from the buff object. I would like either area or proportion of each land cover type.
Any ideas? Thanks!

What you need to consider is that the output of st_intersection() keeps the attributes of x whereas y is only used for clipping. Since buff_coords is of type geometry set without any attributes, your output only includes geometry information. That's why your result looks correct. What is missing, and what you are interested in, are all the attributes to calculate areas, so simply switch the inputs:
st_intersection(land_cover, buff_coords)
#> Simple feature collection with 2153 features and 12 fields
#> Geometry type: GEOMETRY
#> Dimension: XY
#> Bounding box: xmin: -2.373569 ymin: 51.087 xmax: -2.333649 ymax: 51.12766
#> Geodetic CRS: WGS 84
#> First 10 features:
#> [...]
In order to determine proportions of land cover classes you might want to calculate areas of your classes within the buffer per site and use aggregate() to sum areas of the same class in order to get the weights desired.
I'm not quite sure if you can do this operation all at once, so I'll just show the workflow for the first site feature so that you should be able to construct a for-loop in order to iterate over all your features:
# clipping lc data with buffer of first site
x <- st_intersection(land_cover, sites[1,] |> st_geometry() |> st_buffer(dist = 250))
# calculate area
x["area"] <- st_area(x)
# sum area per INTCODE
aggregate(area ~ INTCODE, x, sum)
#> INTCODE area
#> 1 1 49098.67741 [m^2]
#> 2 2 148094.15274 [m^2]
#> 3 3 2090.11273 [m^2]
#> 4 4 55.95573 [m^2]
#> 5 5 41.40524 [m^2]
Edit:
I also noticed the area does not quite add up to pi * 250^2 and I guess this is a result of using st_buffer() and st_area() on a dataset with a geodetic coordinate reference system (EPSG: 4326). In order to reproject your data (land_cover & sites) to a commomly used projected system in your area of interest - I used WGS 84 / UTM zone 30N (EPSG: 32630) here, but you'll probably know better - you can make use of st_transform() resulting in desired values:
a1 <- sites |> st_buffer(dist = 250) |> st_area()
head(a1)
#> Units: [m^2]
#> [1] 199380.3 199244.6 199278.6 199276.3 199343.5 199301.7
a2 <- sites |> sf::st_transform("epsg:32630") |> st_buffer(dist = 250) |> st_area()
head(a2)
#> Units: [m^2]
#> [1] 196259.8 196259.8 196259.8 196259.8 196259.8 196259.8
However, since you seem to be interested in proportions only and not absolute areas per se, I would not expect significant distortions here based on your radius chosen - but I'll probably still go for a suitable projected reference system.

subset a r dataframe based on conditions

I have a dataframe like this:
From this, I want to delete the rows which have same elements repeated in the multiple columns: assemblyName, qseqid, sseqid. At the same time, I want to keep the rows even if they are repeated but if there is an other row/rows with same assemblyName but have different qseqid and sseqid.
For example, from the above image: batch2_21032019_ENT924_assembly.fasta in the assemblyName has 4 records (rows: 747,748,771,785). If row 771,785 were not there, I would remove rows 747,748 which contains same assemblyName, qseqid and sseqid. But since here, there are row 771 and 785 which have different qseqid and sseqid, I wish to retain all the 4 rows.
But the last few rows from 1422 to 1503, I do not want to keep them because they are repeated in the columns assemblyName, qseqid, sseqid
Basically, what I want is a dataframe with the following output:
How do I achieve this in R? Here is my dput snippet:
structure(list(assemblyName = structure(c(5L, 12L, 12L, 24L,
24L, 42L, 48L, 48L, 48L, 48L, 76L, 76L, 76L, 79L, 79L, 79L), .Label = c("batch1_08032019_ENT1252_assembly.fasta",
"batch1_08032019_ENT1350_assembly.fasta", "batch1_08032019_ENT1368_assembly.fasta",
"batch1_08032019_ENT1382_assembly.fasta", "batch1_08032019_ENT1420_assembly.fasta",
"batch1_08032019_ENT1458_assembly.fasta", "batch1_08032019_ENT1489_assembly.fasta",
"batch14_02082019_ENT1646_assembly.fasta", "batch2_21032019_ENT1079_assembly.fasta",
"batch2_21032019_ENT1192_assembly.fasta", "batch2_21032019_ENT1219_assembly.fasta",
"batch2_21032019_ENT1250_assembly.fasta", "batch2_21032019_ENT1357_assembly.fasta",
"batch2_21032019_ENT1440_assembly.fasta", "batch2_21032019_ENT1669_assembly.fasta",
"batch2_21032019_ENT1758_assembly.fasta", "batch2_21032019_ENT1916_assembly.fasta",
"batch2_21032019_ENT1940_assembly.fasta", "batch2_21032019_ENT1968_assembly.fasta",
"batch2_21032019_ENT256_assembly.fasta", "batch2_21032019_ENT264_assembly.fasta",
"batch2_21032019_ENT267_assembly.fasta", "batch2_21032019_ENT268_assembly.fasta",
"batch2_21032019_ENT285_assembly.fasta", "batch2_21032019_ENT3_assembly.fasta",
"batch2_21032019_ENT310_assembly.fasta", "batch2_21032019_ENT312_assembly.fasta",
"batch2_21032019_ENT337_assembly.fasta", "batch2_21032019_ENT341_assembly.fasta",
"batch2_21032019_ENT358_assembly.fasta", "batch2_21032019_ENT360_assembly.fasta",
"batch2_21032019_ENT378_assembly.fasta", "batch2_21032019_ENT385_assembly.fasta",
"batch2_21032019_ENT421_assembly.fasta", "batch2_21032019_ENT422_assembly.fasta",
"batch2_21032019_ENT423_assembly.fasta", "batch2_21032019_ENT454_assembly.fasta",
"batch2_21032019_ENT465_assembly.fasta", "batch2_21032019_ENT466_assembly.fasta",
"batch2_21032019_ENT473_assembly.fasta", "batch2_21032019_ENT497_assembly.fasta",
"batch2_21032019_ENT5_assembly.fasta", "batch2_21032019_ENT50_assembly.fasta",
"batch2_21032019_ENT595_assembly.fasta", "batch2_21032019_ENT607_assembly.fasta",
"batch2_21032019_ENT708_assembly.fasta", "batch2_21032019_ENT807_assembly.fasta",
"batch2_21032019_ENT924_assembly.fasta", "batch20_11102019_ENT1249_assembly.fasta",
"batch20_11102019_ENT783_assembly.fasta", "batch20_11102019_ENT784_assembly.fasta",
"batch20_11102019_ENT785_assembly.fasta", "batch20_11102019_ENT835_assembly.fasta",
"batch20_11102019_ENT849_assembly.fasta", "batch20_11102019_ENT897_assembly.fasta",
"batch20_11102019_ENT901_assembly.fasta", "batch20_11102019_ENT903_assembly.fasta",
"batch20_11102019_ENT912_assembly.fasta", "batch20_11102019_ENT916_assembly.fasta",
"batch20_11102019_ENT938_assembly.fasta", "batch20_11102019_ENT965_assembly.fasta",
"batch20_11102019_ENT981_assembly.fasta", "batch20_11102019_ENT983_assembly.fasta",
"batch20_11102019_ENT990_assembly.fasta", "batch21x_16102019_ENT1251_assembly.fasta",
"batch21x_16102019_ENT1262_assembly.fasta", "batch21x_16102019_ENT1263_assembly.fasta",
"batch21x_16102019_ENT1266_assembly.fasta", "batch21x_16102019_ENT1267_assembly.fasta",
"batch21x_16102019_ENT1271_assembly.fasta", "batch21x_16102019_ENT1274_assembly.fasta",
"batch21x_16102019_ENT1276_assembly.fasta", "batch21x_16102019_ENT1278_assembly.fasta",
"batch21x_16102019_ENT1279_assembly.fasta", "batch21x_16102019_ENT1280_assembly.fasta",
"batch21x_16102019_ENT1288_assembly.fasta", "batch21x_16102019_ENT1296_assembly.fasta",
"batch21x_16102019_ENT1300_assembly.fasta", "batch21x_16102019_ENT1321_assembly.fasta",
"batch21x_16102019_ENT1322_assembly.fasta", "batch21x_16102019_ENT1325_assembly.fasta",
"batch21x_16102019_ENT1330_assembly.fasta", "batch21x_16102019_ENT1384_assembly.fasta",
"batch21x_16102019_ENT1393_assembly.fasta", "batch21x_16102019_ENT1394_assembly.fasta",
"batch21x_16102019_ENT1396_assembly.fasta", "batch21x_16102019_ENT1465_assembly.fasta",
"batch21x_16102019_ENT1502_assembly.fasta", "batch21x_16102019_ENT1570_assembly.fasta",
"batch21x_16102019_ENT1599_assembly.fasta", "batch21x_16102019_ENT1649_assembly.fasta",
"batch21x_16102019_ENT1676_assembly.fasta", "batch21x_16102019_ENT1681_assembly.fasta",
"batch21x_16102019_ENT1691_assembly.fasta", "batch21x_16102019_ENT1837_assembly.fasta",
"batch21x_16102019_ENT1895_assembly.fasta", "batch21x_16102019_ENT1896_assembly.fasta",
"batch21x_16102019_ENT1929_assembly.fasta", "batch21x_16102019_ENT1941_assembly.fasta",
"batch21x_16102019_ENT209_assembly.fasta", "batch21x_16102019_ENT689_assembly.fasta",
"batch21x_16102019_ENT732_assembly.fasta", "batch21x_16102019_ENT790_assembly.fasta",
"batch22_18102019_ENT1331_assembly.fasta", "batch22_18102019_ENT1336_assembly.fasta",
"batch22_18102019_ENT1337_assembly.fasta", "batch22_18102019_ENT1352_assembly.fasta",
"batch22_18102019_ENT1359_assembly.fasta", "batch22_18102019_ENT1413_assembly.fasta",
"batch22_18102019_ENT1475_assembly.fasta", "batch22_18102019_ENT1515_assembly.fasta",
"batch22_18102019_ENT1559_assembly.fasta", "batch22_18102019_ENT1580_assembly.fasta",
"batch22_18102019_ENT1595_assembly.fasta"), class = "factor"),
qseqid = structure(c(107L, 71L, 89L, 109L, 122L, 119L, 19L,
19L, 69L, 117L, 61L, 61L, 61L, 72L, 72L, 72L), .Label = c("",
"1_length=4775743_depth=1.00x_circular=true", "1_length=4782442_depth=1.00x_circular=true",
"1_length=4798941_depth=1.00x_circular=true", "1_length=4811272_depth=1.00x_circular=true",
"1_length=4854518_depth=1.00x_circular=true", "1_length=4870013_depth=1.00x",
"1_length=4877560_depth=1.00x_circular=true", "1_length=4879405_depth=1.00x_circular=true",
"1_length=4880726_depth=1.00x_circular=true", "1_length=4910657_depth=1.00x_circular=true",
"1_length=4945396_depth=1.00x_circular=true", "1_length=4980803_depth=1.00x_circular=true",
"1_length=4995045_depth=1.00x_circular=true", "1_length=4995093_depth=1.00x_circular=true",
"1_length=5004019_depth=1.00x_circular=true", "1_length=5024487_depth=1.00x_circular=true",
"1_length=5386431_depth=1.00x_circular=true", "1_length=5418220_depth=1.00x_circular=true",
"10_length=167596_depth=0.99x_circular=true", "10_length=41259_depth=2.09x_circular=true",
"19_length=13505_depth=0.90x", "2_length=123974_depth=3.35x_circular=true",
"2_length=174608_depth=2.06x_circular=true", "2_length=177751_depth=2.86x_circular=true",
"2_length=258181_depth=1.64x_circular=true", "2_length=278408_depth=1.57x_circular=true",
"2_length=41183_depth=3.34x_circular=true", "2_length=41190_depth=5.16x_circular=true",
"2_length=41215_depth=3.01x_circular=true", "2_length=41217_depth=2.25x_circular=true",
"2_length=71861_depth=0.77x_circular=true", "2_length=71861_depth=2.89x_circular=true",
"2_length=72968_depth=0.51x_circular=true", "2_length=91069_depth=1.21x_circular=true",
"2_length=91643_depth=2.11x_circular=true", "2_length=92072_depth=0.81x_circular=true",
"20_length=5469_depth=1.62x", "22_length=90789_depth=1.44x_circular=true",
"3_length=112875_depth=0.98x_circular=true", "3_length=118064_depth=3.79x_circular=true",
"3_length=127528_depth=1.73x_circular=true", "3_length=164596_depth=1.02x_circular=true",
"3_length=165091_depth=1.16x_circular=true", "3_length=165095_depth=2.12x_circular=true",
"3_length=165543_depth=0.59x_circular=true", "3_length=174323_depth=1.93x_circular=true",
"3_length=174796_depth=0.74x_circular=true", "3_length=180232_depth=1.88x_circular=true",
"3_length=180817_depth=1.81x_circular=true", "3_length=38610_depth=3.37x_circular=true",
"3_length=41182_depth=3.37x_circular=true", "3_length=41182_depth=4.04x_circular=true",
"3_length=41184_depth=4.98x_circular=true", "3_length=41185_depth=5.84x_circular=true",
"3_length=41186_depth=3.26x_circular=true", "3_length=41232_depth=2.49x_circular=true",
"3_length=50138_depth=1.79x_circular=true", "3_length=58175_depth=0.39x",
"3_length=62334_depth=2.76x_circular=true", "3_length=67915_depth=0.42x",
"3_length=71861_depth=2.39x_circular=true", "3_length=71861_depth=2.99x_circular=true",
"3_length=72145_depth=0.97x_circular=true", "3_length=72168_depth=0.80x_circular=true",
"3_length=731673_depth=1.22x", "3_length=74789_depth=2.02x_circular=true",
"3_length=74794_depth=2.26x_circular=true", "3_length=75214_depth=2.77x_circular=true",
"3_length=79594_depth=1.46x_circular=true", "3_length=88353_depth=2.00x_circular=true",
"3_length=89872_depth=0.49x_circular=true", "3_length=90666_depth=2.61x_circular=true",
"3_length=96544_depth=1.98x_circular=true", "38_length=14280_depth=2.50x",
"39_length=41187_depth=6.10x_circular=true", "4_length=129927_depth=0.88x",
"4_length=161129_depth=0.64x_circular=true", "4_length=165104_depth=0.58x_circular=true",
"4_length=170202_depth=0.80x", "4_length=41182_depth=1.27x_circular=true",
"4_length=41186_depth=4.34x_circular=true", "4_length=41188_depth=2.88x_circular=true",
"4_length=41190_depth=2.44x_circular=true", "4_length=41190_depth=3.46x_circular=true",
"4_length=41215_depth=3.66x_circular=true", "4_length=41224_depth=2.50x_circular=true",
"4_length=46161_depth=2.45x_circular=true", "4_length=51479_depth=1.11x_circular=true",
"4_length=71795_depth=2.16x_circular=true", "4_length=71859_depth=1.18x_circular=true",
"4_length=71861_depth=0.80x_circular=true", "4_length=71861_depth=1.56x_circular=true",
"4_length=71861_depth=1.95x_circular=true", "4_length=71861_depth=3.09x_circular=true",
"4_length=71861_depth=3.28x_circular=true", "4_length=71868_depth=0.67x_circular=true",
"4_length=71875_depth=0.43x_circular=true", "4_length=72162_depth=0.61x_circular=true",
"4_length=72162_depth=1.28x_circular=true", "4_length=73397_depth=1.60x_circular=true",
"4_length=73399_depth=2.01x_circular=true", "4_length=88057_depth=1.72x_circular=true",
"46_length=5494_depth=4.49x", "5_length=110787_depth=5.28x_circular=true",
"5_length=41185_depth=3.00x_circular=true", "5_length=41190_depth=2.13x_circular=true",
"5_length=42336_depth=2.31x_circular=true", "5_length=46161_depth=2.20x_circular=true",
"5_length=51479_depth=1.02x_circular=true", "5_length=51479_depth=2.10x_circular=true",
"5_length=55129_depth=3.86x_circular=true", "5_length=6141_depth=16.45x_circular=true",
"5_length=62044_depth=5.10x", "5_length=6211_depth=4.24x_circular=true",
"5_length=65498_depth=0.98x_circular=true", "5_length=70472_depth=2.31x",
"5_length=71861_depth=1.24x_circular=true", "6_length=41190_depth=4.77x_circular=true",
"6_length=46161_depth=0.86x_circular=true", "6_length=71861_depth=2.24x_circular=true",
"6_length=7604_depth=3.49x_circular=true", "6_length=80977_depth=0.65x_circular=true",
"6_length=95567_depth=1.42x_circular=true", "64_length=6420_depth=2.15x"
), class = "factor"), sseqid = c("NDM-1", "NDM-5", "OXA-181",
"NDM-5", "OXA-181", "NDM-1", "OXA-181", "OXA-181", "NDM-5",
"NDM-5", "OXA-181", "OXA-181", "OXA-181", "OXA-181", "OXA-181",
"OXA-181"), qlen = c(41190L, 88353L, 51479L, 46161L, 7604L,
41190L, 5418220L, 5418220L, 75214L, 70472L, 67915L, 67915L,
67915L, 89872L, 89872L, 89872L), qstart = c(23131L, 14408L,
25135L, 25547L, 5873L, 23131L, 5244180L, 4252066L, 36917L,
20047L, 51138L, 44729L, 38320L, 4678L, 11087L, 88141L), qend = c(23943L,
15220L, 25932L, 26359L, 6670L, 23943L, 5244977L, 4252863L,
37729L, 20859L, 51935L, 45526L, 39117L, 5475L, 11884L, 88938L
)), .Names = c("assemblyName", "qseqid", "sseqid", "qlen",
"qstart", "qend"), row.names = c(78L, 209L, 223L, 389L, 403L,
656L, 747L, 748L, 771L, 785L, 1422L, 1423L, 1424L, 1501L, 1502L,
1503L), class = "data.frame")

We can create a key column combining qseqid and sseqid and then select those assemblyName who either have more one distinct value of key or have only one row in them.
library(dplyr)
df %>%
mutate(key = paste0(qseqid, sseqid)) %>%
group_by(assemblyName) %>%
filter(n_distinct(key) > 1 | n() == 1) %>%
select(-key)
# assemblyName qseqid sseqid qlen qstart end
# <fct> <fct> <chr> <int> <int> <int>
# 1 batch1_08032019_ENT1420_assembly.fasta 5_length=41190_depth=2.13x_circular=true NDM-1 41190 23131 23943
# 2 batch2_21032019_ENT1250_assembly.fasta 3_length=88353_depth=2.00x_circular=true NDM-5 88353 14408 15220
# 3 batch2_21032019_ENT1250_assembly.fasta 4_length=51479_depth=1.11x_circular=true OXA-181 51479 25135 25932
# 4 batch2_21032019_ENT285_assembly.fasta 5_length=46161_depth=2.20x_circular=true NDM-5 46161 25547 26359
# 5 batch2_21032019_ENT285_assembly.fasta 6_length=7604_depth=3.49x_circular=true OXA-181 7604 5873 6670
# 6 batch2_21032019_ENT5_assembly.fasta 6_length=41190_depth=4.77x_circular=true NDM-1 41190 23131 23943
# 7 batch2_21032019_ENT924_assembly.fasta 1_length=5418220_depth=1.00x_circular=true OXA-181 5418220 5244180 5244977
# 8 batch2_21032019_ENT924_assembly.fasta 1_length=5418220_depth=1.00x_circular=true OXA-181 5418220 4252066 4252863
# 9 batch2_21032019_ENT924_assembly.fasta 3_length=75214_depth=2.77x_circular=true NDM-5 75214 36917 37729
#10 batch2_21032019_ENT924_assembly.fasta 5_length=70472_depth=2.31x NDM-5 70472 20047 20859

We can also use str_c
library(dplyr)
library(stringr)
library(dplyr)
df %>%
mutate(key = str_c(qseqid, sseqid)) %>%
group_by(assemblyName) %>%
slice(which(n_distinct(key) > 1 | n() == 1)) %>%
select(-key)

Distance between points in R: distCosine() for a table?

I have a list of position and I would like to know the distances between the closest points. I tried to use distCosine() but there is an issue. Here is what I did:
my data, sorted by lat
structure(list(lat = c(53.56478, 53.919724, 54.109047, 54.109047,
54.36612, 55.48143, 56.2335, 56.682796, 56.93616, 57.804092,
58.82089, 59.297623, 59.335075, 59.907795, 60.125046, 60.274445,
60.289204, 60.386665, 60.591167, 64.68329), long = c(14.585611,
14.286517, 13.807847, 13.807847, 10.997632, 18.182697, 16.454927,
16.564703, 18.221214, 23.258204, 17.84381, 18.172949, 18.126884,
23.217615, 20.65724, 26.44062, 27.189545, 19.847534, 28.5585,
24.534185)), .Names = c("lat", "long"), row.names = c(2L, 3L,
6L, 11L, 1L, 17L, 15L, 20L, 13L, 19L, 7L, 14L, 4L, 5L, 10L, 12L,
18L, 9L, 8L, 16L), class = "data.frame")
I tried to use distCosine() following an other discussion on stackoverflow to include in a new column the distance from the closest lat (this is why I sorted by lat):
data$a<-outer(seq(nrow(data)),
seq(nrow(data)),
Vectorize(function(i, j) distCosine(data[1,], data[2,]))
)
The result does not work... This is not the distance for each point...
is there an easier way to use distCosine for my request?

I think you just have to replace distCosine(data[1,], data[2,]) by distCosine(data[i,c("long","lat")], data[j,c("long","lat")]):
data <- head(data,5) # smaller example
data$a<-outer( seq(nrow(data)),
seq(nrow(data)),
Vectorize(
function(i, j) distCosine(data[i,c("long","lat")], data[j,c("long","lat")])
)
)
Result:
> data
lat long a.1 a.2 a.3 a.4 a.5
2 53.56478 14.58561 0.00 44146.92 79251.87 79251.87 251291.54
3 53.91972 14.28652 44146.92 0.00 37741.81 37741.81 220118.16
6 54.10905 13.80785 79251.87 37741.81 0.00 0.00 185040.01
11 54.10905 13.80785 79251.87 37741.81 0.00 0.00 185040.01
1 54.36612 10.99763 251291.54 220118.16 185040.01 185040.01 0.00
>

Got it with an other function:
data<-data[c("long","lat")]
distHaversine
t<-distHaversine(p1 = data[-nrow(data),],
p2 = data[-1,]) a<-0 final<-c(a,t) data$dist<-final
a<-0
final<-c(a,t)
data$dist<-final

Plot frequency of events with time in R

I have a reasonable amount of time data, and I'd like to put it in a frequency graph, where the X-axis would be several intervals of time and the Y-axis would be the amount of data I've collected in such period. See this example:
Let's suppose I have this list:
[10:17:55, 10:37:40, 10:40:26, 10:48:18, 11:00:17, 11:01:12, 11:06:58, 11:09:20, 11:43:41, 11:48:24, 11:49:14, 12:07:31, 12:10:52, 12:10:52, 12:19:00, 12:19:00, 12:19:43, 12:20:55, 12:38:27, 12:38:27, 12:55:09, 12:55:10, 12:57:31, 12:57:31, 13:04:16, 13:04:16, 13:06:51 13:06:51, 14:55:06, 14:56:10, 15:01:30, 15:28:42, 3:29:17, 15:35:33, 15:58:32, 16:05:07, 16:09:16, 16:10:36, 16:32:57, 16:32:57, 16:34:32, 16:38:16, 17:43:27, 17:53:01, 17:56:14, 18:08:21, 18:17:23, 18:37:23, 18:37:23, 18:43:13, 18:43:13 18:51:43, 18:51:43, 19:05:39, 19:05:39]
And I'd like to plot a histogram showing how many values are there in intervals of 1h, or 30 minutes (still deciding), such as:
10h - 11h: 4
11h - 12h: 7
.
.
.
19h - 20h: 2
But all that represented in a graph. I know the very basics of how to plot a histogram in R and couldn't figure out how to do that. I've seen some answers making plots throughout the days, which is not much applicable, because these values were collected in different days... Can you guys help me?
EDIT: Here's a dput() of the list:
structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 13L, 14L, 14L, 15L, 16L, 17L, 17L, 18L, 19L, 20L, 20L, 21L,
21L, 22L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 41L, 42L, 42L,
43L, 43L, 44L, 44L), .Label = c("10:17:55", "10:37:40", "10:40:26",
"10:48:18", "11:00:17", "11:01:12", "11:06:58", "11:09:20", "11:43:41",
"11:48:24", "11:49:14", "12:07:31", "12:10:52", "12:19:00", "12:19:43",
"12:20:55", "12:38:27", "12:55:09", "12:55:10", "12:57:31", "13:04:16",
"13:06:51", "14:55:06", "14:56:10", "15:01:30", "15:28:42", "15:29:17",
"15:35:33", "15:58:32", "16:05:07", "16:09:16", "16:10:36", "16:32:57",
"16:34:32", "16:38:16", "17:43:27", "17:53:01", "17:56:14", "18:08:21",
"18:17:23", "18:37:23", "18:43:13", "18:51:43", "19:05:39"), class = "factor")`

There are range, trunc and seq methods for POSIXt or Date objects. Assuming you assign that structure object to a name such as tms this would convert to POSIXct and then construct a range, a sequence of breaks that spanned the hours and then bin within 30 minute intervals:
> tms <- as.POSIXct(tms, format="%H:%M:%S")
> brks <- trunc(range(tms), "hours")
Warning message:
In if (isdst == -1) { :
the condition has length > 1 and only the first element will be used
> hist(tms, breaks=seq(brks[1], brks[2]+3600, by="30 min") )
Notice that the plot method for POSIXt objects handles the x-axis labeling:
I suppose you could check to see if the second "brks" was within the half-hour window for a 30 minute plot. So this would be the code to avoid a blank bin, if targeting half-hour bins:
hist(tms, breaks=seq(brks[1],
brks[2]+ if( as.numeric( max(tms)-brks[2] ) < 30) #diff time in mins
{1800} else{3600},
by="30 min")
)

Here is the method I used to obtain what it is you are after.
This will work for hours and half hours. Not the prettiest, but I think it serves your purpose. You will need to do some massaging of the axes so they display the information you desire. Hopefully that helps!
hours <- as.numeric( format( strptime( times , format = "%H:%M:%S" ) , "%H" ) )
hist( hours , breaks = unique( hours ) )
half_hours <- hours + ( as.numeric( format( strptime( times , format = "%H:%M:%S" ) , "%M" ) ) /60 )
hist(half_hours , breaks = c( unique( hours ) , unique( hours ) + 0.5 ) )

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

How to extract block of rows in R - r

Related

How to pull out only one beta coefficient for each group separately from the regression equation and calculate mean by variables in R

Intersect buffer with vector to extract information

subset a r dataframe based on conditions

Distance between points in R: distCosine() for a table?

Plot frequency of events with time in R

Categories

Resources