Check if a column contains a sequence - r

I want to know, if i can check out if a column of a data frame starts with 0 or 1 and goes till the number of rows without breaking the sequence. Below is a sample data frame.
structure(list(X = 1:22, SNR = c(1.0035798429, 11.9438978154,
NA, 3.2894877794, 4.0170266411, 1.6310522977, 1.6405414787, 1.6625412522,
0.8489116253, 7.5312259672, 7.2832910726, 0.5732577083, NA, 0.8149754292,
1.9981020389, 1.2477052103, 0.9960804911, 10.3402683931, 3.6328270728,
2.5540496855, 41.96873985, 6.2035281045), ID = c(109L, 110L,
111L, 112L, 113L, 114L, 116L, 117L, 118L, 119L, 120L, 121L, 123L,
124L, 125L, 126L, 127L, 128L, 130L, 131L, 132L, 133L), SignalIntensity = c(6.8173738339,
11.5459925418, NA, 9.7804203445, 9.8719842219, 9.0781857736,
8.2289312163, 8.0435364446, 6.1793458315, 10.5581798932, 10.4745329822,
4.1572943809, NA, 6.0451742752, 8.3100219509, 7.4558770659, 7.1464749962,
11.4284386394, 9.6273795753, 9.6807417299, 13.3364944397, 10.4304671876
)), .Names = c("X", "SNR", "ID", "SignalIntensity"), class = "data.frame", row.names = c(NA,
-22L))
How can i check the columns and return the index if present.
Edited: The sequence i am looking for is a natural sequence. Suppose if a data frame has 10 rows, the column if present should have a sequence 1,2,3,4,5,6,7,8,9,10 or may be like0,1,2,3,4,5,6,7,8,9. . So the sequence starts with 0 or 1 and goes till the number of rows with an increment of 1 for each row.

You could loop through the columns with sapply. Create a function to check whether there are any NAs. If not (!any), we get the difference (diff) between the adjacent element, check if all the element difference is 1 (all(diff(x)==1) and (&) the first value of the column is 0 or 1 (x[1] %in% 0:1). If there is any NA, the output for that column will be 'FALSE'.
f1 <- function(x) {
if(!any(is.na(x)))
all(diff(x)==1) & x[1] %in% 0:1
else FALSE}
which(sapply(df, f1))
#X
#1

Related

rounding up time in a date-time variable in R

I have been looking for a solution for a problem I thought is easy to solve but it doesn't seem so. I am trying roundup a DateTime Variable to the next 5 minutes.
Incidents<- structure(list(Event.Id = c(240252L, 240258L, 240261L, 240264L,
240294L, 240313L, 240314L, 240315L, 240331L, 240343L, 240350L,
240358L, 240379L, 240381L, 240396L), DateTime = structure(c(1427868300,
1427872800, 1427873100, 1427873700, 1427881500, 1427890260, 1427890860,
1427890980, 1427900160, 1427902860, 1427904480, 1427906700, 1427910900,
1427911920, 1427919360), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(5L,
11L, 14L, 15L, 40L, 59L, 60L, 61L, 77L, 90L, 96L, 103L, 122L,
123L, 135L), class = "data.frame")
I used:
Incidents$DateTime2<- round_date(Incidents$DateTime,unit="5 minutes")
However, this rounds the time to the closest 5 minutes. I want the next 5 minutes instead.
I tried this code below:
Incidents<- Incidents %>% mutate(time = as.numeric(substr(DateTime, 16,16)))
Incidents$time <- ifelse(Incidents$time <6,5- Incidents$time, 10 - Incidents$time)
minut<- minutes(5)
Incidents$DateTime3<- ifelse(Incidents$time <3, Incidents$DateTime2 + minut, Incidents$DateTime2)
However, this gives me numeric values that I could not convert to date time variable.

filter rows based on all previous row data in another column

I have a data table which i would like to filter based on multiple conditions looking at all previous columns. If the New_ID.1 row number is before the same id in the New_ID column, remove the row where New_ID= New_ID.1 from previous row. For example, I would remove the New_ID 581 in row 3 because New_ID.1 is in row 1. However I don't want to remove row six New_ID 551 since row 3 New_ID.551 would be removed first. Essentially, I think i need to loop through and create a new filtered table for each row and repeat process?
orig_df<- structure(list(New_ID = c(557L, 588L, 581L, 580L, 591L, 551L,
300L, 112L), New_ID.1 = c(581L, 591L, 551L, 300L, 112L, 584L,
416L, 115L), distance = c(3339.15537217173, 3432.33715484179,
5268.69104753613, 5296.72042763528, 5271.94917463488, 5258.66546295312,
5286.99982045171, 5277.81914818968), X.x = c(903604.940384474,
819515.728302034, 903663.550206032, 866828.860223065, 819525.350044447,
903720.790105847, 866881.654186025, 819585.173276271), Y.x = c(1027706.41509243,
1026880.34660449, 1024367.77412815, 1023962.99139374, 1023448.02293581,
1019099.39402149, 1018666.53407908, 1018176.41319296), X.y = c(903663.550206032,
819525.350044447, 903720.790105847, 866881.654186025, 819585.173276271,
903801.327345876, 866919.184271939, 819630.672367509), Y.y = c(1024367.77412815,
1023448.02293581, 1019099.39402149, 1018666.53407908, 1018176.41319296,
1013841.34531459, 1013379.66746509, 1012898.79016799), Y_filter = c(3338.64096427278,
3432.32366867992, 5268.38010666054, 5296.45731465891, 5271.60974284587,
5258.04870690871, 5286.86661398865, 5277.62302497006), X_filter = c(58.609821557533,
9.62174241337925, 57.2398998149438, 52.7939629601315, 59.8232318238588,
80.5372400298947, 37.5300859131385, 45.4990912381327), row.number = 1:8), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
End result would retain rows 1,2,4,6 and 8 from original data
output_table<-structure(list(New_ID = c(557L, 588L, 580L, 551L, 112L), New_ID.1 = c(581L,
591L, 300L, 584L, 115L), distance = c(3339.15537217173, 3432.33715484179,
5296.72042763528, 5258.66546295312, 5277.81914818968), X.x = c(903604.940384474,
819515.728302034, 866828.860223065, 903720.790105847, 819585.173276271
), Y.x = c(1027706.41509243, 1026880.34660449, 1023962.99139374,
1019099.39402149, 1018176.41319296), X.y = c(903663.550206032,
819525.350044447, 866881.654186025, 903801.327345876, 819630.672367509
), Y.y = c(1024367.77412815, 1023448.02293581, 1018666.53407908,
1013841.34531459, 1012898.79016799), Y_filter = c(3338.64096427278,
3432.32366867992, 5296.45731465891, 5258.04870690871, 5277.62302497006
), X_filter = c(58.609821557533, 9.62174241337925, 52.7939629601315,
80.5372400298947, 45.4990912381327), row.number = c(1L, 2L, 4L,
6L, 8L)), row.names = c(NA, -5L), class = c("tbl_df", "tbl",
"data.frame"))
Below is a simpler problem that might be of help.
Original data
A|B
C|D
B|E
E|F
Updated data table
A|B
C|D
E|F
I think looping through the rows and saving the ids that you already encountered should be enough?
orig_df <- as.data.frame(orig_df)
included_rows <- rep(FALSE, nrow(orig_df))
seen_ids <- c()
for(i in 1:nrow(orig_df)){
# Skip row if we have seen either ID already
if(orig_df[i, 'New_ID'] %in% seen_ids) next
if(orig_df[i, 'New_ID.1'] %in% seen_ids) next
# If both ids are new, we save them as seen and include the entry
seen_ids <- c(seen_ids, orig_df[i, 'New_ID'] , orig_df[i, 'New_ID.1'] )
included_rows[i] <- TRUE
}
filtered_df <- orig_df[included_rows,]

subset a r dataframe based on conditions

I have a dataframe like this:
From this, I want to delete the rows which have same elements repeated in the multiple columns: assemblyName, qseqid, sseqid. At the same time, I want to keep the rows even if they are repeated but if there is an other row/rows with same assemblyName but have different qseqid and sseqid.
For example, from the above image: batch2_21032019_ENT924_assembly.fasta in the assemblyName has 4 records (rows: 747,748,771,785). If row 771,785 were not there, I would remove rows 747,748 which contains same assemblyName, qseqid and sseqid. But since here, there are row 771 and 785 which have different qseqid and sseqid, I wish to retain all the 4 rows.
But the last few rows from 1422 to 1503, I do not want to keep them because they are repeated in the columns assemblyName, qseqid, sseqid
Basically, what I want is a dataframe with the following output:
How do I achieve this in R? Here is my dput snippet:
structure(list(assemblyName = structure(c(5L, 12L, 12L, 24L,
24L, 42L, 48L, 48L, 48L, 48L, 76L, 76L, 76L, 79L, 79L, 79L), .Label = c("batch1_08032019_ENT1252_assembly.fasta",
"batch1_08032019_ENT1350_assembly.fasta", "batch1_08032019_ENT1368_assembly.fasta",
"batch1_08032019_ENT1382_assembly.fasta", "batch1_08032019_ENT1420_assembly.fasta",
"batch1_08032019_ENT1458_assembly.fasta", "batch1_08032019_ENT1489_assembly.fasta",
"batch14_02082019_ENT1646_assembly.fasta", "batch2_21032019_ENT1079_assembly.fasta",
"batch2_21032019_ENT1192_assembly.fasta", "batch2_21032019_ENT1219_assembly.fasta",
"batch2_21032019_ENT1250_assembly.fasta", "batch2_21032019_ENT1357_assembly.fasta",
"batch2_21032019_ENT1440_assembly.fasta", "batch2_21032019_ENT1669_assembly.fasta",
"batch2_21032019_ENT1758_assembly.fasta", "batch2_21032019_ENT1916_assembly.fasta",
"batch2_21032019_ENT1940_assembly.fasta", "batch2_21032019_ENT1968_assembly.fasta",
"batch2_21032019_ENT256_assembly.fasta", "batch2_21032019_ENT264_assembly.fasta",
"batch2_21032019_ENT267_assembly.fasta", "batch2_21032019_ENT268_assembly.fasta",
"batch2_21032019_ENT285_assembly.fasta", "batch2_21032019_ENT3_assembly.fasta",
"batch2_21032019_ENT310_assembly.fasta", "batch2_21032019_ENT312_assembly.fasta",
"batch2_21032019_ENT337_assembly.fasta", "batch2_21032019_ENT341_assembly.fasta",
"batch2_21032019_ENT358_assembly.fasta", "batch2_21032019_ENT360_assembly.fasta",
"batch2_21032019_ENT378_assembly.fasta", "batch2_21032019_ENT385_assembly.fasta",
"batch2_21032019_ENT421_assembly.fasta", "batch2_21032019_ENT422_assembly.fasta",
"batch2_21032019_ENT423_assembly.fasta", "batch2_21032019_ENT454_assembly.fasta",
"batch2_21032019_ENT465_assembly.fasta", "batch2_21032019_ENT466_assembly.fasta",
"batch2_21032019_ENT473_assembly.fasta", "batch2_21032019_ENT497_assembly.fasta",
"batch2_21032019_ENT5_assembly.fasta", "batch2_21032019_ENT50_assembly.fasta",
"batch2_21032019_ENT595_assembly.fasta", "batch2_21032019_ENT607_assembly.fasta",
"batch2_21032019_ENT708_assembly.fasta", "batch2_21032019_ENT807_assembly.fasta",
"batch2_21032019_ENT924_assembly.fasta", "batch20_11102019_ENT1249_assembly.fasta",
"batch20_11102019_ENT783_assembly.fasta", "batch20_11102019_ENT784_assembly.fasta",
"batch20_11102019_ENT785_assembly.fasta", "batch20_11102019_ENT835_assembly.fasta",
"batch20_11102019_ENT849_assembly.fasta", "batch20_11102019_ENT897_assembly.fasta",
"batch20_11102019_ENT901_assembly.fasta", "batch20_11102019_ENT903_assembly.fasta",
"batch20_11102019_ENT912_assembly.fasta", "batch20_11102019_ENT916_assembly.fasta",
"batch20_11102019_ENT938_assembly.fasta", "batch20_11102019_ENT965_assembly.fasta",
"batch20_11102019_ENT981_assembly.fasta", "batch20_11102019_ENT983_assembly.fasta",
"batch20_11102019_ENT990_assembly.fasta", "batch21x_16102019_ENT1251_assembly.fasta",
"batch21x_16102019_ENT1262_assembly.fasta", "batch21x_16102019_ENT1263_assembly.fasta",
"batch21x_16102019_ENT1266_assembly.fasta", "batch21x_16102019_ENT1267_assembly.fasta",
"batch21x_16102019_ENT1271_assembly.fasta", "batch21x_16102019_ENT1274_assembly.fasta",
"batch21x_16102019_ENT1276_assembly.fasta", "batch21x_16102019_ENT1278_assembly.fasta",
"batch21x_16102019_ENT1279_assembly.fasta", "batch21x_16102019_ENT1280_assembly.fasta",
"batch21x_16102019_ENT1288_assembly.fasta", "batch21x_16102019_ENT1296_assembly.fasta",
"batch21x_16102019_ENT1300_assembly.fasta", "batch21x_16102019_ENT1321_assembly.fasta",
"batch21x_16102019_ENT1322_assembly.fasta", "batch21x_16102019_ENT1325_assembly.fasta",
"batch21x_16102019_ENT1330_assembly.fasta", "batch21x_16102019_ENT1384_assembly.fasta",
"batch21x_16102019_ENT1393_assembly.fasta", "batch21x_16102019_ENT1394_assembly.fasta",
"batch21x_16102019_ENT1396_assembly.fasta", "batch21x_16102019_ENT1465_assembly.fasta",
"batch21x_16102019_ENT1502_assembly.fasta", "batch21x_16102019_ENT1570_assembly.fasta",
"batch21x_16102019_ENT1599_assembly.fasta", "batch21x_16102019_ENT1649_assembly.fasta",
"batch21x_16102019_ENT1676_assembly.fasta", "batch21x_16102019_ENT1681_assembly.fasta",
"batch21x_16102019_ENT1691_assembly.fasta", "batch21x_16102019_ENT1837_assembly.fasta",
"batch21x_16102019_ENT1895_assembly.fasta", "batch21x_16102019_ENT1896_assembly.fasta",
"batch21x_16102019_ENT1929_assembly.fasta", "batch21x_16102019_ENT1941_assembly.fasta",
"batch21x_16102019_ENT209_assembly.fasta", "batch21x_16102019_ENT689_assembly.fasta",
"batch21x_16102019_ENT732_assembly.fasta", "batch21x_16102019_ENT790_assembly.fasta",
"batch22_18102019_ENT1331_assembly.fasta", "batch22_18102019_ENT1336_assembly.fasta",
"batch22_18102019_ENT1337_assembly.fasta", "batch22_18102019_ENT1352_assembly.fasta",
"batch22_18102019_ENT1359_assembly.fasta", "batch22_18102019_ENT1413_assembly.fasta",
"batch22_18102019_ENT1475_assembly.fasta", "batch22_18102019_ENT1515_assembly.fasta",
"batch22_18102019_ENT1559_assembly.fasta", "batch22_18102019_ENT1580_assembly.fasta",
"batch22_18102019_ENT1595_assembly.fasta"), class = "factor"),
qseqid = structure(c(107L, 71L, 89L, 109L, 122L, 119L, 19L,
19L, 69L, 117L, 61L, 61L, 61L, 72L, 72L, 72L), .Label = c("",
"1_length=4775743_depth=1.00x_circular=true", "1_length=4782442_depth=1.00x_circular=true",
"1_length=4798941_depth=1.00x_circular=true", "1_length=4811272_depth=1.00x_circular=true",
"1_length=4854518_depth=1.00x_circular=true", "1_length=4870013_depth=1.00x",
"1_length=4877560_depth=1.00x_circular=true", "1_length=4879405_depth=1.00x_circular=true",
"1_length=4880726_depth=1.00x_circular=true", "1_length=4910657_depth=1.00x_circular=true",
"1_length=4945396_depth=1.00x_circular=true", "1_length=4980803_depth=1.00x_circular=true",
"1_length=4995045_depth=1.00x_circular=true", "1_length=4995093_depth=1.00x_circular=true",
"1_length=5004019_depth=1.00x_circular=true", "1_length=5024487_depth=1.00x_circular=true",
"1_length=5386431_depth=1.00x_circular=true", "1_length=5418220_depth=1.00x_circular=true",
"10_length=167596_depth=0.99x_circular=true", "10_length=41259_depth=2.09x_circular=true",
"19_length=13505_depth=0.90x", "2_length=123974_depth=3.35x_circular=true",
"2_length=174608_depth=2.06x_circular=true", "2_length=177751_depth=2.86x_circular=true",
"2_length=258181_depth=1.64x_circular=true", "2_length=278408_depth=1.57x_circular=true",
"2_length=41183_depth=3.34x_circular=true", "2_length=41190_depth=5.16x_circular=true",
"2_length=41215_depth=3.01x_circular=true", "2_length=41217_depth=2.25x_circular=true",
"2_length=71861_depth=0.77x_circular=true", "2_length=71861_depth=2.89x_circular=true",
"2_length=72968_depth=0.51x_circular=true", "2_length=91069_depth=1.21x_circular=true",
"2_length=91643_depth=2.11x_circular=true", "2_length=92072_depth=0.81x_circular=true",
"20_length=5469_depth=1.62x", "22_length=90789_depth=1.44x_circular=true",
"3_length=112875_depth=0.98x_circular=true", "3_length=118064_depth=3.79x_circular=true",
"3_length=127528_depth=1.73x_circular=true", "3_length=164596_depth=1.02x_circular=true",
"3_length=165091_depth=1.16x_circular=true", "3_length=165095_depth=2.12x_circular=true",
"3_length=165543_depth=0.59x_circular=true", "3_length=174323_depth=1.93x_circular=true",
"3_length=174796_depth=0.74x_circular=true", "3_length=180232_depth=1.88x_circular=true",
"3_length=180817_depth=1.81x_circular=true", "3_length=38610_depth=3.37x_circular=true",
"3_length=41182_depth=3.37x_circular=true", "3_length=41182_depth=4.04x_circular=true",
"3_length=41184_depth=4.98x_circular=true", "3_length=41185_depth=5.84x_circular=true",
"3_length=41186_depth=3.26x_circular=true", "3_length=41232_depth=2.49x_circular=true",
"3_length=50138_depth=1.79x_circular=true", "3_length=58175_depth=0.39x",
"3_length=62334_depth=2.76x_circular=true", "3_length=67915_depth=0.42x",
"3_length=71861_depth=2.39x_circular=true", "3_length=71861_depth=2.99x_circular=true",
"3_length=72145_depth=0.97x_circular=true", "3_length=72168_depth=0.80x_circular=true",
"3_length=731673_depth=1.22x", "3_length=74789_depth=2.02x_circular=true",
"3_length=74794_depth=2.26x_circular=true", "3_length=75214_depth=2.77x_circular=true",
"3_length=79594_depth=1.46x_circular=true", "3_length=88353_depth=2.00x_circular=true",
"3_length=89872_depth=0.49x_circular=true", "3_length=90666_depth=2.61x_circular=true",
"3_length=96544_depth=1.98x_circular=true", "38_length=14280_depth=2.50x",
"39_length=41187_depth=6.10x_circular=true", "4_length=129927_depth=0.88x",
"4_length=161129_depth=0.64x_circular=true", "4_length=165104_depth=0.58x_circular=true",
"4_length=170202_depth=0.80x", "4_length=41182_depth=1.27x_circular=true",
"4_length=41186_depth=4.34x_circular=true", "4_length=41188_depth=2.88x_circular=true",
"4_length=41190_depth=2.44x_circular=true", "4_length=41190_depth=3.46x_circular=true",
"4_length=41215_depth=3.66x_circular=true", "4_length=41224_depth=2.50x_circular=true",
"4_length=46161_depth=2.45x_circular=true", "4_length=51479_depth=1.11x_circular=true",
"4_length=71795_depth=2.16x_circular=true", "4_length=71859_depth=1.18x_circular=true",
"4_length=71861_depth=0.80x_circular=true", "4_length=71861_depth=1.56x_circular=true",
"4_length=71861_depth=1.95x_circular=true", "4_length=71861_depth=3.09x_circular=true",
"4_length=71861_depth=3.28x_circular=true", "4_length=71868_depth=0.67x_circular=true",
"4_length=71875_depth=0.43x_circular=true", "4_length=72162_depth=0.61x_circular=true",
"4_length=72162_depth=1.28x_circular=true", "4_length=73397_depth=1.60x_circular=true",
"4_length=73399_depth=2.01x_circular=true", "4_length=88057_depth=1.72x_circular=true",
"46_length=5494_depth=4.49x", "5_length=110787_depth=5.28x_circular=true",
"5_length=41185_depth=3.00x_circular=true", "5_length=41190_depth=2.13x_circular=true",
"5_length=42336_depth=2.31x_circular=true", "5_length=46161_depth=2.20x_circular=true",
"5_length=51479_depth=1.02x_circular=true", "5_length=51479_depth=2.10x_circular=true",
"5_length=55129_depth=3.86x_circular=true", "5_length=6141_depth=16.45x_circular=true",
"5_length=62044_depth=5.10x", "5_length=6211_depth=4.24x_circular=true",
"5_length=65498_depth=0.98x_circular=true", "5_length=70472_depth=2.31x",
"5_length=71861_depth=1.24x_circular=true", "6_length=41190_depth=4.77x_circular=true",
"6_length=46161_depth=0.86x_circular=true", "6_length=71861_depth=2.24x_circular=true",
"6_length=7604_depth=3.49x_circular=true", "6_length=80977_depth=0.65x_circular=true",
"6_length=95567_depth=1.42x_circular=true", "64_length=6420_depth=2.15x"
), class = "factor"), sseqid = c("NDM-1", "NDM-5", "OXA-181",
"NDM-5", "OXA-181", "NDM-1", "OXA-181", "OXA-181", "NDM-5",
"NDM-5", "OXA-181", "OXA-181", "OXA-181", "OXA-181", "OXA-181",
"OXA-181"), qlen = c(41190L, 88353L, 51479L, 46161L, 7604L,
41190L, 5418220L, 5418220L, 75214L, 70472L, 67915L, 67915L,
67915L, 89872L, 89872L, 89872L), qstart = c(23131L, 14408L,
25135L, 25547L, 5873L, 23131L, 5244180L, 4252066L, 36917L,
20047L, 51138L, 44729L, 38320L, 4678L, 11087L, 88141L), qend = c(23943L,
15220L, 25932L, 26359L, 6670L, 23943L, 5244977L, 4252863L,
37729L, 20859L, 51935L, 45526L, 39117L, 5475L, 11884L, 88938L
)), .Names = c("assemblyName", "qseqid", "sseqid", "qlen",
"qstart", "qend"), row.names = c(78L, 209L, 223L, 389L, 403L,
656L, 747L, 748L, 771L, 785L, 1422L, 1423L, 1424L, 1501L, 1502L,
1503L), class = "data.frame")
We can create a key column combining qseqid and sseqid and then select those assemblyName who either have more one distinct value of key or have only one row in them.
library(dplyr)
df %>%
mutate(key = paste0(qseqid, sseqid)) %>%
group_by(assemblyName) %>%
filter(n_distinct(key) > 1 | n() == 1) %>%
select(-key)
# assemblyName qseqid sseqid qlen qstart end
# <fct> <fct> <chr> <int> <int> <int>
# 1 batch1_08032019_ENT1420_assembly.fasta 5_length=41190_depth=2.13x_circular=true NDM-1 41190 23131 23943
# 2 batch2_21032019_ENT1250_assembly.fasta 3_length=88353_depth=2.00x_circular=true NDM-5 88353 14408 15220
# 3 batch2_21032019_ENT1250_assembly.fasta 4_length=51479_depth=1.11x_circular=true OXA-181 51479 25135 25932
# 4 batch2_21032019_ENT285_assembly.fasta 5_length=46161_depth=2.20x_circular=true NDM-5 46161 25547 26359
# 5 batch2_21032019_ENT285_assembly.fasta 6_length=7604_depth=3.49x_circular=true OXA-181 7604 5873 6670
# 6 batch2_21032019_ENT5_assembly.fasta 6_length=41190_depth=4.77x_circular=true NDM-1 41190 23131 23943
# 7 batch2_21032019_ENT924_assembly.fasta 1_length=5418220_depth=1.00x_circular=true OXA-181 5418220 5244180 5244977
# 8 batch2_21032019_ENT924_assembly.fasta 1_length=5418220_depth=1.00x_circular=true OXA-181 5418220 4252066 4252863
# 9 batch2_21032019_ENT924_assembly.fasta 3_length=75214_depth=2.77x_circular=true NDM-5 75214 36917 37729
#10 batch2_21032019_ENT924_assembly.fasta 5_length=70472_depth=2.31x NDM-5 70472 20047 20859
We can also use str_c
library(dplyr)
library(stringr)
library(dplyr)
df %>%
mutate(key = str_c(qseqid, sseqid)) %>%
group_by(assemblyName) %>%
slice(which(n_distinct(key) > 1 | n() == 1)) %>%
select(-key)

extracting number of local extremum point in a time-series data in R

I have 3000 people in a dataset, each of them have Heart rate as a time-series.
the time series interval is of 24 hours and in each hour there is max of 6 values, makes it max total of 24*6=144 values for each person (there is usually less).
for every person I want to extract the number of "peeks" he has in the time interval.
I have checked and didn't find any function the calculate or give back more then the "absolute min/max" of the interval.
I have add an example of what I need, in the below graph although there is one absolute min and max, I need the whole four points (2 max and 2 min).
I really don't know how to extract this expect of creating my one function and using a lot of for-loops which isn't good.
can you help?
sample of the time-series:
dput(hr[1:20,])
structure(list(ID = c(5838L, 5838L, 5838L, 5838L, 5838L, 5838L,
5838L, 5838L, 5838L, 5838L, 5838L, 5838L, 5838L, 5838L, 5838L,
5983L, 5983L, 5983L, 5983L, 5983L), Heart.Rate = c(103L, 109L,
109L, 109L, 111L, 111L, 120L, 122L, 125L, 62L, 73L, 84L, 92L,
97L, 98L, 101L, 105L, 105L, 106L, 106L), Time = structure(c(1077080040,
1077083640, 1077084000, 1077084240, 1077083040, 1077085440, 1077082440,
1077081240, 1077081840, 1077086640, 1077087240, 1077084900, 1077080700,
1077080400, 1077086040, 1088496000, 1088494680, 1088495280, 1088498280,
1088504880), class = c("POSIXct", "POSIXt"), tzone = "UTC")), .Names = c("ID",
"Heart.Rate", "Time"), row.names = c(NA, 20L), class = "data.frame")
What you want is finding inflation points (local extremums)
You may want to check this package. I used it occasionally and it was really useful.
Also you may want to check this post;
As it says there, you want to find the point that the sign of change in y (heart rate) changes.
In reference to there, this should work for you:
infl <- c(FALSE, diff(diff(Heart.rate)>0)!=0)

Error when making a sparse matrix

I am facing a problem I do not understand. It's a follow-up on answers suggested here and here
I have two identically structured datasets. One I created as a reproducible example for which the code works, and one which is real for which the code does not work. After staring at it for hours I cannot find what is causing the error.
The following gives an example that works
df <- data.table(cbind(rep(seq(1,25), each = 4 )), cbind(rep(seq(1,40), length.out = 100)))
colnames(df) <- c("a", "b") #ignore warning
setkey(df, a, b)
This is just to create a reproducible example. When I apply the - slightly adjusted - code suggested in the mentioned SO articles I get what I am looking for: a sparse matrix that indicates when two elements in column b occur together for values of column a
library(Matrix)
s <- sparseMatrix(
df$a,
df$b,
dimnames = list(
unique(df$a),unique(df$b)), x = 1)
v <- t(s) %*% s
Now I am doing - in my eyes - exactly the same on my real dataset which is much longer.
A sample dput below looks like this
test <- dput(dk[1:50,])
structure(list(pid = c(204L, 204L, 207L, 254L, 254L, 258L, 258L,
258L, 258L, 258L, 265L, 265L, 269L, 269L, 269L, 269L, 1520L,
1520L, 1520L, 1520L, 1532L, 1532L, 1534L, 1534L, 1534L, 1534L,
1539L, 1539L, 1543L, 1543L, 1546L, 1546L, 1546L, 1546L, 1546L,
1546L, 1546L, 1549L, 1549L, 1549L, 1559L, 1559L, 1559L, 1559L,
1559L, 1559L, 1559L, 1561L, 1561L, 1561L), cid = c(11023L, 11787L,
14232L, 14470L, 14480L, 1290L, 1637L, 4452L, 13964L, 14590L,
17814L, 23453L, 6658L, 10952L, 17259L, 27549L, 11034L, 22748L,
23345L, 23347L, 10487L, 11162L, 15570L, 15629L, 17983L, 17999L,
17531L, 22497L, 14425L, 14521L, 11495L, 24948L, 24962L, 24969L,
24972L, 24973L, 30627L, 17886L, 18428L, 23972L, 13890L, 13936L,
14432L, 21230L, 21271L, 21384L, 21437L, 341L, 354L, 6302L)), .Names = c("pid",
"cid"), sorted = c("pid", "cid"), class = c("data.table", "data.frame"
), row.names = c(NA, -50L), .internal.selfref = <pointer: 0x0000000000100788>)
Then when running the same formula, I get an error
s <- sparseMatrix(test$pid,test$cid,dimnames = list(unique(test$pid), unique(test$cid)),x = 1)
The Error (which occurs in the test dataset as well) reads as follows:
Error in validObject(r) :
invalid class “dgTMatrix” object: length(Dimnames[[1]])' must match Dim[1]
The problem disappears when I remove the dimnames but I really need these dimnames to make sense of the results. I'm sure I'm missing out on something obvious. Can someone please tell me what it is ?
We can convert the 'pid', 'cid' columns to factor and coerce back to numeric or use match with unique values of each column to get the row/column index and this should work in creating sparseMatrix.
test1 <- test[, lapply(.SD, function(x)
as.numeric(factor(x, levels=unique(x))))]
Or we use match
test1 <- test[, lapply(.SD, function(x) match(x, unique(x)))]
s1 <- sparseMatrix(test1$pid,test1$cid,dimnames = list(unique(test$pid),
unique(test$cid)),x = 1)
dim(s1)
#[1] 15 50
s1[1:3, 1:3]
#3 x 3 sparse Matrix of class "dgCMatrix"
# 11023 11787 14232
#204 1 1 .
#207 . . 1
#254 . . .
head(test)
# pid cid
#1: 204 11023
#2: 204 11787
#3: 207 14232
#4: 254 14470
#5: 254 14480
#6: 258 1290
EDIT:
If we want this for the full row/column index specified in 'test', we need to make the dimnames as the same length as the max of 'pid', 'cid'
rnm <- seq(max(test$pid))
cnm <- seq(max(test$cid))
s2 <- sparseMatrix(test$pid, test$cid, dimnames=list(rnm, cnm))
dim(s2)
#[1] 1561 30627
s2[1:3, 1:3]
#3 x 3 sparse Matrix of class "ngCMatrix"
# 1 2 3
#1 . . .
#2 . . .
#3 . . .

Resources