Delete all columns in which all rows have a value below 10000 - r

I have a big dataset (2300 obs x 9700 var) in which I only want to keep the variables (columns) in which all samples (rows) have values above 10000.
I tried a lot of codes, for example:
subset <- df[,apply(df,1,function(z) !all(z<10000))]  # NOTE(review): MARGIN = 1 iterates over rows, but the result is used to select columns; MARGIN = 2 was probably intended. Also `subset` shadows base::subset.
subset<- df[,df> 10000]  # NOTE(review): df > 10000 is a logical matrix, not a valid column selector
But they are not working. Means (rowMeans/colMeans) don't help either, since a column's mean can be above 10000 even though some individual values in it are below 10000, and vice versa.
Anybody some tips how to tackle this?

Try this to keep only the rows in which all values are >10000:
# Keep a row only when every value in it exceeds 10000.
# df > 10000 is a logical matrix; apply(.., 1, all) checks each row.
df[apply(df > 10000, 1, all), ]
# a b c
#4 14139 127746 10911
#7 11582 73952 10821
To keep only the columns in which all values are >10000:
# Keep a column only when every value in it exceeds 10000.
# vapply over the columns gives a logical selector of length ncol(df).
df[, vapply(df, function(col) all(col > 10000), logical(1))]
# [1] 40004 105808 70261 127746 60177 134365 73952 86584 87551 67781
data
# Example data: 10 rows, 3 numeric columns.
df <- data.frame(
  a = c(13773, 8680, 4854, 14139, 3106, 3044, 11582, 9475, 728, 4666),
  b = c(40004, 105808, 70261, 127746, 60177, 134365, 73952, 86584, 87551, 67781),
  c = c(1913, 2092, 14468, 10911, 14414, 8015, 10821, 12636, 12320, 1266)
)
# a b c
# 1 13773 40004 1913
# 2 8680 105808 2092
# 3 4854 70261 14468
# 4 14139 127746 10911
# 5 3106 60177 14414
# 6 3044 134365 8015
# 7 11582 73952 10821
# 8 9475 86584 12636
# 9 728 87551 12320
# 10 4666 67781 1266

Related

Basic use of approx() for lookup table with linear interpolation

I have a vector, my_points and a dataframe that describes the almost linear relationship between points and values.
How do I obtain the vector, my_values, from the relationship described in the dataframe and my_points using linear interpolation?
Assume the relationship beyond the last point in the data frame remains linear.
# Query points at which to evaluate the (approximately linear) relationship.
my_points <- c(4400, 8800, 13200, 37600, 42000, 46400, 50800, 55200, 59600,
64000, 68400, 72800, 77200, 81600, 86000, 90400, 94800, 99200,
103600, 108000, 112400, 116800, 121200, 125600)
# Lookup table: known points with their values. The last known point is
# 71500, so query points beyond it need extrapolation, not interpolation.
df <- structure(list(points = c(3000, 4500, 7500, 11000, 14500, 21500,
43000, 71500), values = c(20, 30, 50, 75, 100, 150, 300, 500),
points_per_value = c(150, 150, 150, 146.666666666667, 145,
143.333333333333, 143.333333333333, 143)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
You said "interpolation", in which case you can get:
# Linearly interpolate every non-"points" column at my_points.
# approx() returns NA for query points beyond the last known point.
interp_cols <- lapply(df[-1], function(col) approx(df$points, col, xout = my_points)$y)
cbind(data.frame(my_points), interp_cols)
# my_points values points_per_value
# 1 4400 29.33333 150.0000
# 2 8800 59.28571 148.7619
# 3 13200 90.71429 145.6190
# 4 37600 262.32558 143.3333
# 5 42000 293.02326 143.3333
# 6 46400 323.85965 143.2936
# 7 50800 354.73684 143.2421
# 8 55200 385.61404 143.1906
# 9 59600 416.49123 143.1392
# 10 64000 447.36842 143.0877
# 11 68400 478.24561 143.0363
# 12 72800 NA NA
# 13 77200 NA NA
# 14 81600 NA NA
# 15 86000 NA NA
# 16 90400 NA NA
# 17 94800 NA NA
# 18 99200 NA NA
# 19 103600 NA NA
# 20 108000 NA NA
# 21 112400 NA NA
# 22 116800 NA NA
# 23 121200 NA NA
# 24 125600 NA NA
But you also said "beyond the last point", suggesting you want "extrapolation":
# Same idea, but Hmisc::approxExtrap() extends the last linear segment
# beyond the final known point instead of returning NA.
extrap_cols <- lapply(df[-1], function(col) {
  Hmisc::approxExtrap(df$points, col, xout = my_points)$y
})
cbind(data.frame(my_points), extrap_cols)
# my_points values points_per_value
# 1 4400 29.33333 150.0000
# 2 8800 59.28571 148.7619
# 3 13200 90.71429 145.6190
# 4 37600 262.32558 143.3333
# 5 42000 293.02326 143.3333
# 6 46400 323.85965 143.2936
# 7 50800 354.73684 143.2421
# 8 55200 385.61404 143.1906
# 9 59600 416.49123 143.1392
# 10 64000 447.36842 143.0877
# 11 68400 478.24561 143.0363
# 12 72800 509.12281 142.9848
# 13 77200 540.00000 142.9333
# 14 81600 570.87719 142.8819
# 15 86000 601.75439 142.8304
# 16 90400 632.63158 142.7789
# 17 94800 663.50877 142.7275
# 18 99200 694.38596 142.6760
# 19 103600 725.26316 142.6246
# 20 108000 756.14035 142.5731
# 21 112400 787.01754 142.5216
# 22 116800 817.89474 142.4702
# 23 121200 848.77193 142.4187
# 24 125600 879.64912 142.3673
If all you need is the vector of one of these columns, then
# The value column is df$values -- df has no column named my_values.
Hmisc::approxExtrap(df$points, df$values, xout = my_points)$y

Function only evaluates first row of R data table

I have a simple data table apple that has numerous instances of numbers shortened as 40.08B, 40.08M, 400.08K, etc. I need to remove these letters and replace them with the appropriate number of zeros (i.e. 400.08K becomes 400080), so I wrote the following code:
# NOTE(review): apple[,-1] is a whole data.frame, not an atomic vector, so
# str_detect()/parse_number() coerce it -- hence the repeated "argument is
# not an atomic vector; coercing" warnings and the apparent single-row,
# header-less result. case_when() needs to be applied column by column
# (see the answers below using mutate(across(...))).
# Also: the last branch calls parse_number() unqualified while the others
# use readr:: -- it fails unless readr is attached.
apple2 <- dplyr::case_when(
stringr::str_detect(apple[,-1], 'B') ~ readr::parse_number(as.character(apple[,-1]), na = c("", "NA")) * 1e9,
stringr::str_detect(apple[,-1], 'M') ~ readr::parse_number(as.character(apple[,-1]), na = c("", "NA")) * 1e6,
stringr::str_detect(apple[,-1], 'K') ~ readr::parse_number(as.character(apple[,-1]), na = c("", "NA")) * 1e3,
TRUE ~ parse_number(as.character(apple[,-1]), na = c("", "NA"), trim_ws = TRUE)
)
The code works as expected in finding and converting the strings into appropriate numbers, but it only runs on the first row of the data table. In addition, it removes the headers. The error message is the following:
argument is not an atomic vector; coercingargument is not an atomic vector; coercingargument is not an atomic vector; coercing[1]
I've tried figuring this out for hours but to no avail - what am I doing wrong here? Thank you!
You are using case_when in a somewhat unorthodox way:
## some data:
## some data (random -- call set.seed() first if you need reproducibility):
d <- cbind.data.frame(
  id = LETTERS,
  matrix(
    paste0(
      ceiling(runif(26 * 5, max = 999)),
      sample(c("", "B", "K", "M"), size = 26 * 5, replace = TRUE)
    ),
    nrow = 26
  )
)
library(dplyr)    # provides %>%, mutate(), across(), case_when()
library(stringr)
library(readr)
# Apply the case_when() conversion to every column except the first (id):
# detect the suffix, strip it with parse_number(), scale accordingly.
d %>% mutate(across(-1,
  ~ case_when(
    str_detect(., 'B') ~ parse_number(as.character(.), na = c("", "NA")) * 1e9,
    str_detect(., 'M') ~ parse_number(as.character(.), na = c("", "NA")) * 1e6,
    str_detect(., 'K') ~ parse_number(as.character(.), na = c("", "NA")) * 1e3,
    TRUE ~ parse_number(as.character(.), na = c("", "NA"), trim_ws = TRUE)
  )
))
Input data:
id 1 2 3 4 5
1 A 834 27B 250 881B 988
2 B 313M 506B 309 413 141K
3 C 197 77 824 161B 43K
4 D 845K 172K 745B 922M 145M
5 E 168M 959M 990B 250K 893
6 F 430 687K 368M 10M 824M
7 G 940B 403B 655M 818 777K
8 H 281 833K 86B 849B 16K
9 I 485B 508B 349M 643M 926M
10 J 235B 10B 206M 505K 347M
11 K 897B 727M 405K 987B 674M
12 L 588B 40M 860M 58 934B
13 M 727K 375 188M 728K 201B
14 N 280K 442M 43K 400 445
15 O 988B 388M 530B 702M 240B
16 P 177M 782 410K 254K 758K
17 Q 706K 262 520B 104K 34
18 R 390B 99K 677K 965 635M
19 S 819 115M 920M 580M 295K
20 T 573M 901K 360 7K 88B
21 U 333B 593M 504B 992 241B
22 V 674 192M 841B 644B 659
23 W 524M 581M 692M 41 133
24 X 626K 686M 712K 756M 136B
25 Y 295 468 932M 486B 35K
26 Z 526K 798K 229K 958B 700B
Output:
id 1 2 3 4 5
1 A 8.34e+02 2.70e+10 2.50e+02 8.81e+11 9.88e+02
2 B 3.13e+08 5.06e+11 3.09e+02 4.13e+02 1.41e+05
3 C 1.97e+02 7.70e+01 8.24e+02 1.61e+11 4.30e+04
4 D 8.45e+05 1.72e+05 7.45e+11 9.22e+08 1.45e+08
5 E 1.68e+08 9.59e+08 9.90e+11 2.50e+05 8.93e+02
6 F 4.30e+02 6.87e+05 3.68e+08 1.00e+07 8.24e+08
7 G 9.40e+11 4.03e+11 6.55e+08 8.18e+02 7.77e+05
8 H 2.81e+02 8.33e+05 8.60e+10 8.49e+11 1.60e+04
9 I 4.85e+11 5.08e+11 3.49e+08 6.43e+08 9.26e+08
10 J 2.35e+11 1.00e+10 2.06e+08 5.05e+05 3.47e+08
11 K 8.97e+11 7.27e+08 4.05e+05 9.87e+11 6.74e+08
12 L 5.88e+11 4.00e+07 8.60e+08 5.80e+01 9.34e+11
13 M 7.27e+05 3.75e+02 1.88e+08 7.28e+05 2.01e+11
14 N 2.80e+05 4.42e+08 4.30e+04 4.00e+02 4.45e+02
15 O 9.88e+11 3.88e+08 5.30e+11 7.02e+08 2.40e+11
16 P 1.77e+08 7.82e+02 4.10e+05 2.54e+05 7.58e+05
17 Q 7.06e+05 2.62e+02 5.20e+11 1.04e+05 3.40e+01
18 R 3.90e+11 9.90e+04 6.77e+05 9.65e+02 6.35e+08
19 S 8.19e+02 1.15e+08 9.20e+08 5.80e+08 2.95e+05
20 T 5.73e+08 9.01e+05 3.60e+02 7.00e+03 8.80e+10
21 U 3.33e+11 5.93e+08 5.04e+11 9.92e+02 2.41e+11
22 V 6.74e+02 1.92e+08 8.41e+11 6.44e+11 6.59e+02
23 W 5.24e+08 5.81e+08 6.92e+08 4.10e+01 1.33e+02
24 X 6.26e+05 6.86e+08 7.12e+05 7.56e+08 1.36e+11
25 Y 2.95e+02 4.68e+02 9.32e+08 4.86e+11 3.50e+04
26 Z 5.26e+05 7.98e+05 2.29e+05 9.58e+11 7.00e+11
See also other ways to convert a human-readable byte number to a number, e.g. this or perhaps this
We could make use of str_replace_all instead of multiple str_detect. Match and replace the 'B', 'M', 'K' substring in the column with a named vector in str_replace_all, then separate the column, and do the multiplication based on the separated columns
library(stringr)
library(dplyr)
library(tidyr)
# Replace each suffix with its multiplier, split into number + multiplier,
# then multiply the two pieces.
apple %>%
  mutate(col1 = str_replace_all(col1, setNames(c(' 1e9', ' 1e6', ' 1e3'),
                                               c('B', 'M', 'K')))) %>%
  # sep = " " is needed so a decimal point inside the number (e.g. "400.08K")
  # is not treated as a separator: the default splits on any non-alphanumeric
  # character, which would break "400.08 1e3" into three pieces.
  separate(col1, into = c('col1', 'col2'), sep = " ", convert = TRUE) %>%
  transmute(col1 = col1 * col2)
-output
# col1
#1 4.0e+10
#2 2.0e+08
#3 2.0e+06
#4 4.0e+05
#5 3.6e+10
data
# Example data: one character column of suffixed numbers.
apple <- data.frame(col1 = c("40B", "200M", "2M", "400K", "36B"))

replace data.frame using specified data set

I want to replace data.frame using specified data set
> test_data
support count
1 0.01235235 663
2 0.01373104 737
3 0.01393598 748
4 0.01265045 679
5 0.01548236 831
6 0.01565004 840
> replace_support
2 3 4 6
-0.008884196 -0.007991622 -0.011675116 -0.013086012
names of replace_support corresponds with row name of test_data
my expect is replace the column support
support count
1 0.01235235 663
2 -0.008884196 737
3 -0.007991622 748
4 -0.011675116 679
5 0.01548236 831
6 -0.013086012 840
Here is the example data:
# Example data: the frame to modify, and a named vector of replacements
# whose names are the target row positions.
test_data <- data.frame(
  support = c(0.0123523493684093, 0.0137310429630734, 0.0139359839028207,
              0.0126504452807691, 0.0154823564481872, 0.0156500353988896),
  count = c(663, 737, 748, 679, 831, 840)
)
replace_support <- setNames(
  c(-0.00888419577036815, -0.00799162193023339,
    -0.0116751160488589, -0.0130860121134779),
  c("2", "3", "4", "6")
)
You can use the replace function:
# The names of replace_support are the row positions to overwrite.
idx <- as.integer(names(replace_support))
test_data$support <- replace(test_data$support, idx, replace_support)
test_data
support count
1 0.012352349 663
2 -0.008884196 737
3 -0.007991622 748
4 -0.011675116 679
5 0.015482356 831
6 -0.013086012 840
If the names of replace_support don't match the appropriate indexes, you can supply them manually.
How about:
# Direct indexed assignment: names give the target row positions.
pos <- as.integer(names(replace_support))
test_data$support[pos] <- replace_support
test_data
#> support count
#> 1 0.012352349 663
#> 2 -0.008884196 737
#> 3 -0.007991622 748
#> 4 -0.011675116 679
#> 5 0.015482356 831
#> 6 -0.013086012 840

Ragged rowSums in R

I am trying to do a rowSum for the actuals columns. However, I would like to include the values up to the UpTo date for certain observations. Here is the data frame:
# Example data: two actual and two forecast quarters per company; UpTo
# marks the last quarter to include in the sum (NA = actuals only).
dat <- data.frame(
  Company = c("ABC", "DEF", "XYZ"),
  UpTo = c(NA, "Q2", "Q3"),
  Actual.Q1 = c(100L, 80L, 100L),
  Actual.Q2 = c(50L, 75L, 50L),
  Forecast.Q3 = c(80L, 50L, 80L),
  Forecast.Q4 = c(90L, 80L, 100L),
  row.names = c("1", "2", "3")
)
Company UpTo Actual.Q1 Actual.Q2 Forecast.Q3 Forecast.Q4
1 ABC NA 100 50 80 90
2 DEF Q2 80 75 50 80
3 XYZ Q3 100 50 80 100
For company ABC, since there is no UpTo date, it will just be Actual.Q1 + Actual.Q2, which is 150.
For company DEF, since the UpTo date is Q2, it will be Actual.Q1 + Actual.Q2, which is 155.
For company XYZ, since the UpTo date is Q3, it will be Actual.Q1 + Actual.Q2 + Forecast.Q3, which is 230.
The resulting data frame would look like this:
Company UpTo Actual.Q1 Actual.Q2 Forecast.Q3 Forecast.Q4 SumRecent
1 ABC NA 100 50 80 90 150
2 DEF Q2 80 75 50 80 155
3 XYZ Q3 100 50 80 100 230
I have tried to use the rowSums function. However, it does not take into effect the variable UpTo. Any help is appreciated. Thanks!
Here is a possibility:
# Readable rewrite of the one-liner (note: `df` here is the question's `dat`
# -- presumably renamed; verify against your own object name). For each row,
# sum from column 3 up to the column whose name matches UpTo; when UpTo is
# NA (no match), sum columns 3:4, i.e. the actuals only. vapply() is used
# instead of sapply() so the result type is guaranteed numeric.
df$SumRecent <- vapply(seq_len(nrow(df)), function(i) {
  last <- grep(df[i, 2], colnames(df))[1]  # NA when UpTo has no match
  if (is.na(last)) last <- 4L
  sum(df[i, 3:last])
}, numeric(1))
# Company UpTo Actual.Q1 Actual.Q2 Forecast.Q3 Forecast.Q4 SumRecent
# 1 ABC <NA> 100 50 80 90 150
# 2 DEF Q2 80 75 50 80 155
# 3 XYZ Q3 100 50 80 100 230
We are looking with the use of grep for a match of the value in the column UpTo (df[x,2]) in the column names of df (colnames(df)). If we find it we get the sum, if we don't find it we just sum the values in columns 3 and 4.
We can use binary weighted row sums.
# Build a logical weight matrix: w[i, j] is TRUE when quarter j is at or
# before row i's cutoff quarter.
quarter_cap <- as.character(dat$UpTo)    # guard against a factor column
quarter_cap[is.na(quarter_cap)] <- "Q2"  # an NA cutoff means "through Q2"
w <- outer(as.integer(substr(quarter_cap, 2, 2)), seq_len(4), ">=")
# [,1] [,2] [,3] [,4]
#[1,] TRUE TRUE FALSE FALSE
#[2,] TRUE TRUE FALSE FALSE
#[3,] TRUE TRUE TRUE FALSE
We have a logical matrix. But it does not affect arithmetic computation as TRUE is 1 and FALSE is 0. Then we do weighted row sums:
# Coerce the four quarter columns to a numeric matrix so they can be
# multiplied elementwise by the logical weight matrix w (TRUE = 1, FALSE = 0).
X <- data.matrix(dat[3:6])
# Weighted row sums: quarters past the UpTo cutoff contribute zero.
dat$SumRecent <- rowSums(X * w)
# Company UpTo Actual.Q1 Actual.Q2 Forecast.Q3 Forecast.Q4 SumRecent
#1 ABC <NA> 100 50 80 90 150
#2 DEF Q2 80 75 50 80 155
#3 XYZ Q3 100 50 80 100 230
The advantage of this approach is its speed / efficiency, as it is fully vectorized. This method is super fast. You can refer to the benchmark result in Fast way to create a binary matrix with known number of 1 each row in R.
This should also work:
df$UpTo <- as.character(df$UpTo)  # apply() flattens each row to character anyway
# apply() coerces each row to a character vector, hence as.integer() below.
# A scalar condition wants if/else, not ifelse() (which is for vectors and
# strips attributes).
df$SumRecent <- apply(df, 1, function(x) {
  if (is.na(x[2])) {
    sum(as.integer(x[3:4]))                      # no UpTo: actuals only (cols 3:4)
  } else {
    sum(as.integer(x[3:(grep(x[2], names(df)))]))  # sum up to the matching quarter column
  }
})
df
# Company UpTo Actual.Q1 Actual.Q2 Forecast.Q3 Forecast.Q4 SumRecent
#1 ABC <NA> 100 50 80 90 150
#2 DEF Q2 80 75 50 80 155
#3 XYZ Q3 100 50 80 100 230
Another approach using data table:
library(data.table)  # library() errors if the package is missing; require() only warns
dat <- fread('Company UpTo Actual.Q1 Actual.Q2 Forecast.Q3 Forecast.Q4
ABC NA 100 50 80 90
DEF Q2 80 75 50 80
XYZ Q3 100 50 80 100')
# Per company: if UpTo is NA take the two actuals, otherwise sum every
# column whose name ends in a quarter number up to UpTo's.
dat[, SumRecent := ifelse(is.na(UpTo), Actual.Q1 + Actual.Q2,
  sum(.SD[, grepl(paste0("Q[1-", substring(UpTo, 2), "]$"), names(.SD)), with = FALSE])), by = Company]

Reshape matrix into a list of lists

I have a list as follows:
id | value
----------
4 600
4 899
7 19
13 4930
13 300
: :
There are multiple ID repeats, and each one has a unique value. I want to turn this into something as follows:
id | list
----------
4 c(600, 899)
7 c(19)
13 c(4930, 300)
: :
Is there a vectorized method of accomplishing this?
EDIT: Extending the first question, is there a simple way to do the same thing for a generic MxN matrix? I.e., turning this:
id | value1 value2
-------------------
4 600 a
4 899 b
7 19 d
13 4930 e
13 300 a
: : :
into this:
id | list
----------
4 list(c(600, 899),c('a','b'))
7 list(c(19),c('d'))
13 list(c(4930, 300),c('e','a'))
: :
Thanks!
You could also use tapply if you want to stick with base functions:
tapply(dat[["value"]], dat[["id"]], c)  # one vector of values per id
$`4`
[1] 600 899
$`7`
[1] 19
$`13`
[1] 4930 300
Edit:
For your edited problem, I would go with split and lapply:
# Spell out FALSE: T/F are ordinary reassignable variables, not keywords.
x <- lapply(split(dat[2:3], dat$id), c, use.names = FALSE)
dput(x)
structure(list(`4` = list(c(600, 899), c("a", "b")), `7` = list(
19, "d"), `13` = list(c(4930, 300), c("e", "a"))), .Names = c("4", "7", "13"))
The functions in package plyr should be of help here.
In the following example I assume your data is in the form of a data.frame - even if it really is a list, as you say, it should be straight-forward to convert to a data.frame:
# Example data: repeated ids, one numeric value per row.
dat <- data.frame(
id = c(4, 4, 7, 13, 13),
value = c(600, 899, 19, 4930, 300)
)
library(plyr)
# dlply(): split dat by id, apply the function to each piece, return a list.
dlply(dat, .(id), function(x)x$value)
The result is a list as you specified:
$`4`
[1] 600 899
$`7`
[1] 19
$`13`
[1] 4930 300
attr(,"split_type")
[1] "data.frame"
attr(,"split_labels")
id
1 4
2 7
3 13
I'd just split() the data:
# header = TRUE (spelled out, not T): the first line supplies column names.
d <- read.table(text = "id value
4 600
4 899
7 19
13 4930
13 300", header = TRUE)
# split() groups the value vector by id, giving one vector per id.
split(d$value, d$id)
$`4`
[1] 600 899
$`7`
[1] 19
$`13`
[1] 4930 300

Resources