Splitting a string with a specific pattern - r

I have a table with a character field that can follow any of these patterns:
input
97 # a single number
210 foo # a number and a word
87 bar 89 # a number, a word, a number
21 23 # two numbers
123 2 fizzbuzz # two numbers, a word
12 fizz 34 buzz # a number, a word, a number, a word
I'd like to split each line into up to 4 parts, containing respectively the first number, the first word if it exists, the second number if it exists, and the second word if it exists. So my example would give:
input            nb_1  word_1  nb_2  word_2
97               97
210 foo          210   foo
87 bar 89        87    bar     89
21 23            21            23
123 2 fizzbuzz   123           2     fizzbuzz
12 fizz 34 buzz  12    fizz    34    buzz
Please note the case of two numbers, a word (the second-to-last example): it has nothing in word_1, as there is no word between the two numbers.
Is there a way to do this without a tedious if / if / else structure?
If it helps, all the words belong to a list of 10 specific words. If there are two words, they can be the same or different, and the numbers can be one, two or three digits long.
Thanks
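For reference, the whole split can also be done in one shot with a single regular expression whose four capture groups simply come back empty for the missing parts. This is just a minimal sketch (assuming the words consist of lower-case letters only and the numbers of digits only):
pat <- "^\\s*(\\d+)\\s*([a-z]*)\\s*(\\d*)\\s*([a-z]*)\\s*$"
parts <- regmatches(df$V1, regexec(pat, df$V1))  # full match + 4 groups per row
out <- do.call(rbind, lapply(parts, `[`, -1))    # drop the full match, keep the groups
colnames(out) <- c("nb_1", "word_1", "nb_2", "word_2")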

Here is an idea using gsub and cSplit from the splitstackshape package:
library(splitstackshape)
df$num <- gsub('\\D', ' ', df$V1)   # blank out everything that is not a digit
df$wrds <- gsub('\\d', ' ', df$V1)  # blank out the digits
newdf <- cSplit(df, 2:3, ' ', 'wide')
newdf
#                 V1 num_1 num_2   wrds_1 wrds_2
#1:              97    97    NA       NA     NA
#2:         210 foo   210    NA      foo     NA
#3:       87 bar 89    87    89      bar     NA
#4:           21 23    21    23       NA     NA
#5:  123 2 fizzbuzz   123     2 fizzbuzz     NA
#6: 12 fizz 34 buzz    12    34     fizz   buzz
The only problem is row 5, which can be fixed as follows,
idx <- grep('[0-9]+\\s+[0-9]+\\s+[A-Za-z]', newdf$V1)  # rows where two numbers precede the first word
newdf$wrds_1 <- as.character(newdf$wrds_1)
newdf$wrds_2 <- as.character(newdf$wrds_2)
newdf$wrds_2[idx] <- newdf$wrds_1[idx]
newdf$wrds_1[idx] <- NA
which finally gives,
newdf
#                 V1 num_1 num_2 wrds_1   wrds_2
#1:              97    97    NA     NA       NA
#2:         210 foo   210    NA    foo       NA
#3:       87 bar 89    87    89    bar       NA
#4:           21 23    21    23     NA       NA
#5:  123 2 fizzbuzz   123     2     NA fizzbuzz
#6: 12 fizz 34 buzz    12    34   fizz     buzz
DATA
dput(df)
structure(list(V1 = c("97", " 210 foo", " 87 bar 89",
" 21 23", " 123 2 fizzbuzz",
" 12 fizz 34 buzz")), .Names = "V1", row.names = c(NA,
-6L), class = "data.frame")

Tried in a different way...
library(splitstackshape)
abc <- data.frame(a = c(97, "210 foo", "87 bar 89", "21 23", "123 2 fizzbuzz", "12 fizz 34 buzz"))
abc1 <- data.frame(cSplit(abc, "a", " ", stripWhite = FALSE))
abc <- cbind(abc, abc1)
names(abc) <- c("input", "nb_1", "word_1", "nb_2", "word_2")
abc[, 1:5] <- apply(abc[, 1:5], 2, as.character)
for(i in 1:nrow(abc)){
  # if word_2 is missing and nb_2 actually holds a word, move it over
  abc$word_2[i] <- replace(abc$word_2[i], is.na(abc$word_2[i]), abc$nb_2[grepl("[a-z]", abc$nb_2[i])][i])
  # if nb_2 is missing or holds a word, pull in the number sitting in word_1 (or NA)
  abc$nb_2[i] <- replace(abc$nb_2[i], is.na(abc$nb_2[i]) | grepl("[a-z]", abc$nb_2[i]), abc$word_1[grepl("[0-9]", abc$word_1[i])][i])
}
abc$word_1 <- ifelse(grepl("[0-9]", abc$word_1), NA, abc$word_1)  # numbers misplaced in word_1 become NA
abc[is.na(abc)] <- ""
print(abc)
            input nb_1 word_1 nb_2   word_2
1              97   97
2         210 foo  210    foo
3       87 bar 89   87    bar   89
4           21 23   21          23
5  123 2 fizzbuzz  123           2 fizzbuzz
6 12 fizz 34 buzz   12   fizz   34     buzz

This is a hacky function to do it... although you might have other cases that would break it.
f <- function(x){
  string2 <- strsplit(x, " ")[[1]]
  if (length(string2) < 2)
    return(c(string2, NA, NA, NA))
  arenums <- grepl("\\d", string2)  # which tokens contain a digit
  c(string2[which(arenums)[1]],
    if (arenums[2]) NA else string2[which(!arenums)[1]],
    string2[which(arenums)[2]],
    if (arenums[2]) string2[which(!arenums)[1]] else string2[which(!arenums)[2]])
}
> f("97")
[1] "97" NA NA NA
> f("210 foo")
[1] "210" "foo" NA NA
> f("87 bar 89")
[1] "87" "bar" "89" NA
> f("21 23")
[1] "21" NA "23" NA
> f("123 2 fizzbuzz")
[1] "123" NA "2" "fizzbuzz"
> f("12 fizz 34 buzz")
[1] "12" "fizz" "34" "buzz"

Related

sorting row values in dataframe by column values

I have difficulty sorting row values by a particular column.
The values are in a different order, for example:
METHOD VAL1 VAL2 VAL3
1-A      10    2   15
10-B     11    5   15
11-c     23   45   65
2-F       4   65   67
3-T       4   56   11
and I need it like this:
METHOD VAL1 VAL2 VAL3
1-A      10    2   15
2-F       4   65   67
3-T       4   56   11
10-B     11    5   15
11-c     23   45   65
The sorting order is based on the METHOD column. I've tried to arrange it in many ways but without success.
I have solved this issue, but there is another issue in the same code: the following line works on its own, but it creates a problem when used inside a function.
a1 <- a1[order(as.numeric(gsub("-.*", "", a1$varname))),]
My function is as follows:
t1 <- doTable1(AE_subset$Disp_code, AE_subset$FY, "DisposalMethod", thresh = 0.02, testvar = AE_subset$Attendance, fun = "sum")
doTable1 <- function(var1, var2, varname, testvar = NULL, fun = NULL, inc = TRUE, thresh = 0.02) {
  if (is.null(fun)) {
    a1 <- as.data.frame.matrix(table(var1, var2))
  } else {
    a1 <- as.data.frame.matrix(tapply(testvar, list(var1, var2), FUN = fun, na.rm = TRUE))
  }
  a1 <- rownames_to_column(a1, var = varname)
  a1$FY3PR <- a1$FY3 * proRata
  if (!is.null(fun))
    if (fun == "mean")
      a1$FY3PR <- a1$FY3
  a1 <- a1[order(as.numeric(gsub("-.*", "", a1$varname))), ] # dataframe is not updating here
  a1 <- a1 %>% replace(., is.na(.), 0)
  a1 <- rbind(a1, c("Total", as.numeric(colSums(a1[, 2:4]))))
  return(a1)
}
Put simply, it returns a NULL data frame.
Can anyone identify why this function fails when it comes to the order() command?
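A likely cause, sketched below: $ does not evaluate its argument, so inside the function a1$varname looks for a column literally named "varname" (which doesn't exist, giving NULL), while [[ picks up the column name stored in the variable:
# a1$varname means a1[["varname"]] -- NULL here, so order() has nothing to sort by.
# [[ evaluates its argument and uses the string held in varname instead:
a1 <- a1[order(as.numeric(gsub("-.*", "", a1[[varname]]))), ]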
You can use gsub to split the numbers from the characters and order them:
df[order(as.numeric(gsub("-.*", "", df$METHOD))),]
  METHOD VAL1 VAL2 VAL3
1    1-A   10    2   15
4    2-F    4   65   67
5    3-T    4   56   11
2   10-B   11    5   15
3   11-c   23   45   65
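If pulling in another package is an option, gtools ships a mixedorder function built for exactly this kind of embedded-number ordering (a sketch, assuming gtools is installed):
library(gtools)
df[mixedorder(as.character(df$METHOD)), ]  # "1-A", "2-F", "3-T", "10-B", "11-c"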
With dplyr you can do:
library(dplyr)
dat %>% # we create a new column based on METHOD
  mutate(met_num = as.numeric(gsub("\\D", "", METHOD))) %>% # keeps only the number part
  arrange(met_num) %>% # we arrange just by the number part of METHOD
  select(-met_num) # removes that new column
  METHOD VAL1 VAL2 VAL3
1    1-A   10    2   15
2    2-F    4   65   67
3    3-T    4   56   11
4   10-B   11    5   15
5   11-c   23   45   65
Data used:
tt <- "METHOD VAL1 VAL2 VAL3
1-A 10 2 15
10-B 11 5 15
11-c 23 45 65
2-F 4 65 67
3-T 4 56 11"
dat <- read.table(text = tt, header = TRUE)

r count cells with missing values across each row [duplicate]

I have a dataframe as shown below
Id Date Col1 Col2 Col3 Col4
30 2012-03-31 A42.2 20.46 NA
36 1996-11-15 NA V73 55
96 2010-02-07 X48 Z16 13
40 2010-03-18 AD14 20.12 36
69 2012-02-21 22.45
11 2013-07-03 81 V017 TCG11
22 2001-06-01 67
83 2005-03-16 80.45 V22.15 46.52 X29.11
92 2012-02-12
34 2014-03-10 82.12 N72.22 V45.44
I am trying to count the number of NA or empty cells across each row, and the final expected output is as follows:
Id Date Col1 Col2 Col3 Col4 MissCount
30 2012-03-31 A42.2 20.46 NA 2
36 1996-11-15 NA V73 55 2
96 2010-02-07 X48 Z16 13 1
40 2010-03-18 AD14 20.12 36 1
69 2012-02-21 22.45 3
11 2013-07-03 81 V017 TCG11 1
22 2001-06-01 67 3
83 2005-03-16 80.45 V22.15 46.52 X29.11 0
92 2012-02-12 4
34 2014-03-10 82.12 N72.22 V45.44 1
The last column MissCount will store the number of NAs or empty cells for each row. Any help is much appreciated.
The one-liner
rowSums(is.na(df) | df == "")
given by #DavidArenburg in his comment is definitely the way to go, assuming that you don't mind checking every column in the data frame. If you really only want to check Col1 through Col4, then using an apply function might make more sense.
apply(df, 1, function(x) {
  sum(is.na(x[c("Col1", "Col2", "Col3", "Col4")])) +
    sum(x[c("Col1", "Col2", "Col3", "Col4")] == "", na.rm = TRUE)
})
Edit: Shortened code
apply(df[c("Col1", "Col2", "Col3", "Col4")], 1, function(x) {
sum(is.na(x)) +
sum(x == "", na.rm=TRUE)
})
or if data columns are exactly like the example data:
apply(df[3:6], 1, function(x) {
sum(is.na(x)) +
sum(x == "", na.rm=TRUE)
})
This should do it.
yourframe$MissCount = rowSums(is.na(yourframe) | yourframe == "" | yourframe == " ")
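To restrict the count to the four data columns only, the same idea works on a column subset (a sketch, assuming Col1 to Col4 sit at positions 3 to 6 as in the example):
yourframe$MissCount = rowSums(is.na(yourframe[3:6]) | yourframe[3:6] == "")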
You can use by_row from the purrr library (in current releases it has moved to the purrrlyr package):
library(purrr)
# sample data frame
x <- data.frame(A1 = c(1, NA, 3, NA),
                A2 = c("A", "B", " ", "C"),
                A3 = c(" ", " ", NA, "t"))
Here you apply a function on each row; you can edit it according to your condition, and you can use whatever function you want.
In the following example, I counted the empty or NA entries in each row using sum(...):
by_row(x, function(y) sum(y == " " | is.na(y)),
       .to = "MissCount",
       .collate = "cols")
You will get:
# A tibble: 4 x 4
     A1     A2     A3 MissCount
  <dbl> <fctr> <fctr>     <int>
1     1      A                1
2    NA      B                2
3     3             NA        2
4    NA      C      t         1
We can use Reduce to add up, column by column, a logical vector flagging the NA or empty cells:
Reduce(`+`, lapply(df, function(x) is.na(x) | !nzchar(as.character(x))))

Find multiple consecutive empty lines

I'm trying to chop up a text file into the articles it contains. Usually this is done by identifying a pattern each article begins with; unfortunately, the database I downloaded the articles from doesn't have one. The only pattern I can find is that after each article there are 3 empty lines.
How can I identify three consecutive empty lines?
I know that I can find empty lines with:
Beginnings <- grep('^$', Lines.i)
Beginnings then looks like this:
> Beginnings[1:50]
[1] 1 2 3 6 8 10 12 13 40 41 42 43 45 49 50 51 53 54 62 63 64 65 67
[24] 69 70 110 111 112 113 115 117 121 122 123 125 131 132 133 135 137 138 150 151 152 153 155
[47] 157 158 169 170
You can see that the first article starts after 1 2 3 and the next one after 41 42 43.
So my idea was to just add the newline expression to the pattern
Beginnings <- grep('^$\n^$\n^$\n', Lines.i)
But this does not work: grep is applied element-wise, and each element of Lines.i is a single line, so a pattern containing \n can never match. I would be grateful for any suggestions!
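One way to build on the indices already computed: three consecutive empty lines are exactly the places where the value two positions further along in Beginnings is greater by 2. A sketch of that idea (overlapping triples are each reported):
# Beginnings[i], Beginnings[i]+1, Beginnings[i]+2 all empty
# <=> diff(Beginnings, lag = 2)[i] == 2
starts <- Beginnings[which(diff(Beginnings, lag = 2) == 2)]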
You may try rle:
which(inverse.rle(within.list(rle(!nzchar(v1)),   # runs of empty (TRUE) / non-empty lines
  values[lengths < 3 & values] <- FALSE)))        # drop the empty runs shorter than 3
#[1] 3 4 5 9 10 11 12
data
v1 <- c('ard', 'b', '', '', '', 'rr', '', 'fr', '', '', '', '', 'gh', 'd')
Here's a solution for extracting the article lines only. Turned out much more complex and cryptic than I'd been hoping, but I'm pretty sure it works. Also, thanks to akrun for the test data.
v1 <- c('ard', 'b', '', '', '', 'rr', '', 'fr', '', '', '', '', 'gh', 'd')
ind <- with(rle(c(rep(F, 3), nzchar(v1), rep(F, 3))),
  data.frame(
    start = cumsum(lengths[-length(lengths)])[values[-1] & !values[-length(values)] & lengths[-length(values)] >= 3] - 2,
    end = cumsum(lengths[-length(lengths)])[values[-length(lengths)] & !values[-1] & lengths[-1] >= 3] - 3
  )
)
articles <- lapply(1:nrow(ind), function(r) v1[ind[r, 'start']:ind[r, 'end']])
v1
## [1] "ard" "b" "" "" "" "rr" "" "fr" "" "" "" "" "gh" "d"
ind
## start end
## 1 1 2
## 2 6 8
## 3 13 14
articles
## [[1]]
## [1] "ard" "b"
##
## [[2]]
## [1] "rr" "" "fr"
##
## [[3]]
## [1] "gh" "d"

Filter rows based on values of multiple columns in R

Here is the data set; say its name is DS.
   Abc Def Ghi
1   41 190  67
2   36 118  72
3   12 149  74
4   18 313  62
5   NA  NA  56
6   28  NA  66
7   23 299  65
8   19  99  59
9    8  19  61
10  NA 194  69
How do I get a new dataset DSS where the value of column Abc is greater than 20 and the value of column Def is greater than 100? It should also drop any row where at least one of those columns is NA.
I have tried a few options but wasn't successful. Your help is appreciated.
There are multiple ways of doing it. I have given 5 methods below; the first 4 are faster than the subset function.
R Code:
# Method 1:
DS_Filtered <- na.omit(DS[(DS$Abc > 20 & DS$Def > 100), ])
# Method 2: the which function also drops the NA rows
DS_Filtered <- DS[which(DS$Abc > 20 & DS$Def > 100), ]
# Method 3:
DS_Filtered <- na.omit(DS[(DS$Abc > 20) & (DS$Def > 100), ])
# Method 4: using the dplyr package
library(dplyr)
DS_Filtered <- filter(DS, Abc > 20, Def > 100)
DS_Filtered <- DS %>% filter(Abc > 20 & Def > 100)
# Method 5: the subset function ignores NA by default
DS_Filtered <- subset(DS, Abc > 20 & Def > 100)
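For the data above, every method should keep only rows 1, 2 and 7, the rows with Abc > 20, Def > 100 and no NA in those columns:
#   Abc Def Ghi
# 1  41 190  67
# 2  36 118  72
# 7  23 299  65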

Add new columns to a data.table containing many variables

I want to add many new columns simultaneously to a data.table based on by-group computations. A working example of my data would look something like this:
         Time Stock x1 x2 x3
1: 2014-08-22     A 15 27 34
2: 2014-08-23     A 39 44 29
3: 2014-08-24     A 20 50  5
4: 2014-08-22     B 42 22 43
5: 2014-08-23     B 44 45 12
6: 2014-08-24     B  3 21  2
Now I want to scale and sum many of the variables to get an output like:
         Time Stock x1 x2 x3   x2_scale   x3_scale x2_sum x3_sum
1: 2014-08-22     A 15 27 34 -1.1175975  0.7310560    121     68
2: 2014-08-23     A 39 44 29  0.3073393  0.4085313    121     68
3: 2014-08-24     A 20 50  5  0.8102582 -1.1395873    121     68
4: 2014-08-22     B 42 22 43 -0.5401315  1.1226726     88     57
5: 2014-08-23     B 44 45 12  1.1539172 -0.3274462     88     57
6: 2014-08-24     B  3 21  2 -0.6137858 -0.7952265     88     57
A brute force implementation of my problem would be:
library(data.table)
set.seed(123)
d <- data.table(Time = rep(seq.Date(Sys.Date(), length = 3, by = "day")),
                Stock = rep(LETTERS[1:2], each = 3),
                x1 = sample(1:50, 6),
                x2 = sample(1:50, 6),
                x3 = sample(1:50, 6))
d[, x2_scale := scale(x2), by = Stock]
d[, x3_scale := scale(x3), by = Stock]
d[, x2_sum := sum(x2), by = Stock]
d[, x3_sum := sum(x3), by = Stock]
Other posts describing a similar issue (Add multiple columns to R data.table in one function call? and Assign multiple columns using := in data.table, by group) suggest the following solution:
d[, c("x2_scale","x3_scale"):=list(scale(x2),scale(x3)), by=Stock]
d[, c("x2_sum","x3_sum"):=list(sum(x2),sum(x3)), by=Stock]
But again, this would get very messy with a lot of variables, and it also brings up an error message with scale (though not with sum, since scale returns a matrix rather than a plain vector).
Is there a more efficient way to achieve the required result (keeping in mind that my actual data set is quite large)?
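The error with scale comes from its return type: scale() returns a one-column matrix rather than a plain vector, which is why the answers below unwrap it with [, 1] or c(). A quick illustration:
scale(1:3)       # a 3 x 1 matrix carrying "scaled:center" and "scaled:scale" attributes
scale(1:3)[, 1]  # a plain numeric vector: -1 0 1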
I think with a small modification to your last code you can easily do both, for as many variables as you want:
vars <- c("x2", "x3") # <- Choose the variable you want to operate on
d[, paste0(vars, "_", "scale") := lapply(.SD, function(x) scale(x)[, 1]), .SDcols = vars, by = Stock]
d[, paste0(vars, "_", "sum") := lapply(.SD, sum), .SDcols = vars, by = Stock]
##          Time Stock x1 x2 x3   x2_scale   x3_scale x2_sum x3_sum
## 1: 2014-08-22     A 13 14 32 -1.1338934  1.1323092     87     44
## 2: 2014-08-23     A 25 39  9  0.7559289 -0.3701780     87     44
## 3: 2014-08-24     A 18 34  3  0.3779645 -0.7621312     87     44
## 4: 2014-08-22     B 44  8  6 -0.4730162 -0.7258662     59     32
## 5: 2014-08-23     B 49  3 18 -0.6757374  1.1406469     59     32
## 6: 2014-08-24     B 15 48  8  1.1487535 -0.4147807     59     32
For simple functions (that don't need special treatment like scale) you could easily do something like
vars <- c("x2", "x3") # <- Define the variable you want to operate on
funs <- c("min", "max", "mean", "sum") # <- define your function
for(i in funs){
d[, paste0(vars, "_", i) := lapply(.SD, eval(i)), .SDcols = vars, by = Stock]
}
Another variation using data.table
vars <- c("x2", "x3")
d[, paste0(rep(vars, each=2), "_", c("scale", "sum")) := do.call(`cbind`,
lapply(.SD, function(x) list(scale(x)[,1], sum(x)))), .SDcols=vars, by=Stock]
d
#          Time Stock x1 x2 x3   x2_scale x2_sum   x3_scale x3_sum
#1: 2014-08-22     A 15 27 34 -1.1175975    121  0.7310560     68
#2: 2014-08-23     A 39 44 29  0.3073393    121  0.4085313     68
#3: 2014-08-24     A 20 50  5  0.8102582    121 -1.1395873     68
#4: 2014-08-22     B 42 22 43 -0.5401315     88  1.1226726     57
#5: 2014-08-23     B 44 45 12  1.1539172     88 -0.3274462     57
#6: 2014-08-24     B  3 21  2 -0.6137858     88 -0.7952265     57
Based on comments from #Arun, you could also do:
cols <- paste0(rep(vars, each = 2), "_", c("scale", "sum"))
d[, (cols) := unlist(lapply(.SD, function(x) list(scale(x)[, 1L], sum(x))),
                     rec = F), by = Stock, .SDcols = vars]
You're probably looking for a pure data.table solution, but you could also consider using dplyr here since it works with data.tables as well (no need for conversion). Then, from dplyr you could use the function mutate_at as I do in this example here (with the first data set you showed in your question):
library(dplyr)
dt %>%
  group_by(Stock) %>%
  mutate_at(vars(x2, x3), funs(sum, scale))
#Source: local data table [6 x 9]
#Groups: Stock
#
#        Time Stock x1 x2 x3 x2_sum x3_sum   x2_scale   x3_scale
#1 2014-08-22     A 15 27 34    121     68 -1.1175975  0.7310560
#2 2014-08-23     A 39 44 29    121     68  0.3073393  0.4085313
#3 2014-08-24     A 20 50  5    121     68  0.8102582 -1.1395873
#4 2014-08-22     B 42 22 43     88     57 -0.5401315  1.1226726
#5 2014-08-23     B 44 45 12     88     57  1.1539172 -0.3274462
#6 2014-08-24     B  3 21  2     88     57 -0.6137858 -0.7952265
You can easily add more functions to be calculated, which will create more columns for you. Note that mutate_at applies the functions to the columns you select with vars() (here x2 and x3); you can also exclude columns instead, e.g. vars(-c(x2, x3)).
EDIT: replaced mutate_each above with mutate_at, as mutate_each will be deprecated in the near future.
EDIT: cleaner version using functional. I think this is the closest to the dplyr answer.
library(functional)
funs <- list(scale=Compose(scale, c), sum=sum) # See data.table issue #783 on github for the need for this
cols <- paste0("x", 2:3)
cols.all <- outer(cols, names(funs), paste, sep="_")
d[,
c(cols.all) := unlist(lapply(funs, Curry(lapply, X=.SD)), rec=F),
.SDcols=cols,
by=Stock
]
Produces:
         Time Stock x1 x2 x3   x2_scale   x3_scale x2_sum x3_sum
1: 2014-08-22     A 15 27 34 -1.1175975  0.7310560    121     68
2: 2014-08-23     A 39 44 29  0.3073393  0.4085313    121     68
3: 2014-08-24     A 20 50  5  0.8102582 -1.1395873    121     68
4: 2014-08-22     B 42 22 43 -0.5401315  1.1226726     88     57
5: 2014-08-23     B 44 45 12  1.1539172 -0.3274462     88     57
6: 2014-08-24     B  3 21  2 -0.6137858 -0.7952265     88     57
