R programming. Dataframe and subsets - r

Select only unique sets from a dataframe
here one set = one row of data frame.
syntax in r?
I want set concepts
see this example
1 1 2
1 2 1
1 2 3
o/p:
1 1 2
1 2 3
Here row1 and row2 form the sets ={1,2}, so I need only one copy of such rows.
This is my code for apriori algorithm. The function trim(data,r) is what i'hv been trying as a solution,but isn't working out.
uniqueItemSets<-function(data){
#unique items in basket
items <- c()
for(j in c(1:ncol(data))){
items <- c(items,unique(data[,j]))
}
items <- unique(items)
#return(as.list(items))
return(items)
}
F_itemset<-function(data,candidate,sup){
count <- rep(0,nrow(candidate))
for(i in c(1:nrow(data))){ #every transaction
for(j in c(1:nrow(candidate))){ #every dataset
x <- candidate[j,]
#x <- uniqueItemSets(x)
y <- data[i,]
#y <- uniqueItemSets(y)
if(all(x %in% y)){
count[j] <- count[j] + 1
}
}
}
#pruning
pp<-cbind(candidate,count)
pp<-as.data.frame(pp)
pp<-subset(pp,pp$count>=sup)
return(pp)
}
#k-itemset :k-value
makeItemSet<- function(candidate,k){
l<-combn(candidate,k,simplify=FALSE)
return(l)
}
aprio<-function(data,sup,conf,kmax){
C <- uniqueItemSets(data)
C <- as.data.frame(C)
for(k in c(2:kmax))
{
F <- F_itemset(data,C,sup)
F$count <- NULL
if(nrow(F)<k){
break;
}
F<-t(F)
C <- combn(F,k,simplify=FALSE)
C <- as.data.frame(C)
C <- t(C) #transpose
C<-unique(C)
trim(C,1)
}
return(F)
}
**
new <- data.frame()
trim<-function(data,r)
{
x<-as.data.frame(data[r,])
c<-c()
for(j in c(1:ncol(x))){
c<-c(c,x[,j])
}
c<-unique(c)
if(r+1<=nrow(data)){
for(i in c((r+1):nrow(data))){
t<-c()
for(j in c(1:ncol(data))){
t<-c(t,data[i,j])
}
t<-unique(t)
if(all(t %in% c) && all(c %in% t))
{
data[-i,]
}
}
new <- as.data.frame(data)
if(r+1 < nrow(data)){
trim(data[r+1:nrow(data),],r+1)
}
}
}
**

You can use apply with margin = 1 to execute row wise functions. The only thing to be aware of is that you need to transpose the outcome to get the order you need
d <- data.frame(number1 = c(1,1,1),
number2 = c(1,2,2),
number3 = c(2,1,3))
# next two statements can be run in one line of code if you want
d_sort <- t(apply(d, 1, sort))
# get rid of duplicate rows
unique(d_sort)
[,1] [,2] [,3]
[1,] 1 1 2
[2,] 1 2 3

Related

Psych's reverse.code function producing NAs in R

When using reverse.code in R, the values in my ID column (which are not meant to be reversed) turn into NA once the ID value exceeds 999 (I have 10,110 observations).
Does anyone know if there is anything I can do to fix this?
Is there another function I can use to reverse these items without loosing data?
Here is my code:
library(psych)
keys <- c(1,-1,-1,-1) #Where column 1 = ID and the rest are my variables to be reversed
rev_dat2 <- reverse.code(keys, rev_dat)
Thanks!
Here is the relevant line of the source code of reverse.code(), where new is the object holding the reverse-coded data:
new[abs(new) > 999] <- NA
As you can see, setting values larger than 9999 to missing is hard-coded into the routine. You could write a new version of the function that didn't do that. For example, in the function below, we just make a much larger threshold:
my.reverse.code <- function (keys, items, mini = NULL, maxi = NULL)
{
if (is.vector(items)) {
nvar <- 1
}
else {
nvar <- dim(items)[2]
}
items <- as.matrix(items)
if (is.null(maxi)) {
colMax <- apply(items, 2, max, na.rm = TRUE)
}
else {
colMax <- maxi
}
if (is.null(mini)) {
colMin <- apply(items, 2, min, na.rm = TRUE)
}
else {
colMin <- mini
}
colAdj <- colMax + colMin
if (length(keys) < nvar) {
temp <- keys
if (is.character(temp))
temp <- match(temp, colnames(items))
keys <- rep(1, nvar)
keys[temp] <- -1
}
if (is.list(keys) | is.character(keys)) {
keys <- make.keys(items, keys)
keys <- diag(keys)
}
keys.d <- diag(keys, nvar, nvar)
items[is.na(items)] <- -99999999999
reversed <- items %*% keys.d
adj <- abs(keys * colAdj)
adj[keys > 0] <- 0
new <- t(adj + t(reversed))
new[abs(new) > 99999999999] <- NA
colnames(new) <- colnames(items)
colnames(new)[keys < 0] <- paste(colnames(new)[keys < 0],
"-", sep = "")
return(new)
}
The reason they used a numeric value threshold is that for the recoding they do to work, they needed all values to be numeric. So, they set missing values to -999 and then later turn them back into missing values. The same is done above, but with a lot bigger number.
keys <- c(1,-1,-1,-1) #Where column 1 = ID and the rest are my variables to be reversed
rev_dat <- data.frame(
id = 9998:10002,
x = 1:5,
y = 5:1,
z = 1:5
)
library(psych)
reverse.code(keys, rev_dat)
# id x- y- z-
# [1,] NA 5 1 5
# [2,] NA 4 2 4
# [3,] NA 3 3 3
# [4,] NA 2 4 2
# [5,] NA 1 5 1
my.reverse.code(keys, rev_dat)
# id x- y- z-
# [1,] 9998 5 1 5
# [2,] 9999 4 2 4
# [3,] 10000 3 3 3
# [4,] 10001 2 4 2
# [5,] 10002 1 5 1

Error in if (a[i][j] > 4) { : missing value where TRUE/FALSE needed

Find the number of entries in each row which are greater than 4.
set.seed(75)
aMat <- matrix( sample(10, size=60, replace=T), nr=6)
rowmax=function(a)
{
x=nrow(a)
y=ncol(a)
i=1
j=1
z=0
while (i<=x) {
for(j in 1:y) {
if(!is.na(a[i][j])){
if(a[i][j]>4){
z=z+1
}
}
j=j+1
}
print(z)
i=i+1
}
}
rowmax(aMat)
It is showing the error. I don't want to apply in built function
You could do this easier counting the x that are greater than 4 using length.
rowmax2 <- function(x) apply(x, 1, function(x) {x <- na.omit(x);length(x[x > 4])})
rowmax2(aMat)
# [1] 8 7 8 7 4 3
If you wanted to do this absolutely without any shortcut you could use two for loops. 1 for each row and another for each value in the row.
rowmax = function(a) {
y=nrow(a)
result <- numeric(y)
for(j in seq_len(y)) {
count = 0
for(val in a[j, ]) {
if(!is.na(val) && val > 4)
count = count + 1
}
result[j] <- count
}
return(result)
}
rowmax(aMat)
#[1] 8 7 8 7 4 3
If you wanted to do this using in-built functions in base R you could use rowSums.
rowSums(aMat > 4, na.rm = TRUE)
#[1] 8 7 8 7 4 3
There are several errors in you code:
You should put z <- 0 inside while loop
You should use a[i,j] for the matrix indexing, rather than a[i][j]
Below is a version after fixing the problems
rowmax <- function(a) {
x <- nrow(a)
y <- ncol(a)
i <- 1
j <- 1
while (i <= x) {
z <- 0
for (j in 1:y) {
if (!is.na(a[i, j])) {
if (a[i, j] > 4) {
z <- z + 1
}
}
j <- j + 1
}
print(z)
i <- i + 1
}
}
and then we get
> rowmax(aMat)
[1] 8
[1] 7
[1] 8
[1] 7
[1] 4
[1] 3
A concise approach to make it is using rowSums, e.g.,
rowSums(aMat, na.rm = TRUE)

Implement table() function as a user defined function

x <- c(1,2,3,2,1)
table(x)
# x
# 1 2 3
# 2 2 1
Outputs how many times each element occur in the vector.
I am trying to imitate the above function using function()
Below is my code:
TotalTimes = function(x){
times = 0
y = unique(x)
for (i in 1:length(y)) {
for (i in 1:length(x)) {
if(y[i] == x[i])
times = times + 1
}
return(times)
}
}
What would be the right approach?
Here's a one-liner, using rle():
f <- function(x) {
with(rle(sort(x)), setNames(lengths, values))
}
f(c(1,2,3,2,1))
# 1 2 3
# 2 2 1
Alternatively, here's an option that's less "tricky", and is probably a better model for learning to code in an R-ish way:
f2 <- function(x) {
ss <- sort(x)
uu <- unique(ss)
names(uu) <- uu
sapply(uu, function(u) sum(ss == u))
}
f2(c(1,2,3,2,1))
# 1 2 3
# 2 2 1
function(x) {
q = sapply(unique(x), function(i) sum(x == i))
names(q) = unique(x)
return(q)
}
Here is one method using base R:
# data
x <- c(1,2,3,2,1)
# set up
y <- sort(unique(x))
counts <- rep_len(0, length.out=length(y))
names(counts) <- y
for(i in seq_along(x)) {
counts[x[i] == y] <- counts[x[i] == y] + 1
}
Wrapping it in a function:
table2 <- function(x) {
# transform x into character vector to reduce search cost in loop
x <- as.character(x)
y <- sort(unique(x))
counts <- rep_len(0, length.out=length(y))
names(counts) <- y
for(i in seq_along(x)) {
counts[x[i]] <- counts[x[i]] + 1L
}
return(counts)
}
This version only accepts a single vector, of course. At #Frank's suggestion, the function version is slightly different, and possibly faster, in that it transforms the input x into a character. The potential speed up is in the search in counts[x[i]] where the name in counts is referred to (as x[i]), rather than performing a search using "==."

Excluding rows with NAs in column-wise calculations

I have the below function:
colNames = c(1,4)
myfun = function(a,b){
test$result = 0.0
for (i in colNames)
{
test$result = test$result + (test[,i] * exp(-a*test[,i+1]) * exp(b*test[,i+2]))
}
return(test$result)
}
I am basically trying to multiply 3 columns in a sequence (by performing exp operation on i+1 and i+2th columns and multiplying them with col i) and adding their result to a similar operation done to the next 3 columns.
However, I have several null values and whenever I encounter a row in test[,i] with a null value, I want to exclude it from the calculation and perform the next loop.
I mean rows with null values in test[,i] should not be used in the calculation of test$result. Is there anyway to do this?
Sample data:
2 1708.637715 21.30199589 1 408.4464296 19.8614872
1 1708.637715 21.30199589 1 408.4464296 19.8614872
2 1708.637715 21.30199589 1 408.4464296 19.8614872
1 1708.637715 21.30199589 1 408.4464296 19.8614872
6 1708.637715 21.30199589 NA 408.4464296 19.8614872
0 1708.637715 21.30199589 NA 408.4464296 19.8614872
My first iteration should run normally, but in the next iteration only columns 1 to 4 have to be used in the addition
Please help
You simply have to filter out any rows with NA before you enter the loop. To do that the code would be:
test <- test[!apply(is.na(test), 1, any),]
So then if you alter the function to:
new.myfun = function(a,b){
test <- test[!apply(is.na(test), 1, any),]
test$result = 0.0
for (i in colNames)
{
test$result = test$result + (test[,i] * exp(-a*test[,i+1]) * exp(b*test[,i+2]))
}
return(test$result)
}
new.myfun(1,1)
With the output:
[1] 1.736616e-169 1.736616e-169 1.736616e-169 1.736616e-169
Which is hopefully what you're trying to achieve.
You can explicitly iterate through rows (or use apply function):
new.myfun = function(a,b){
check.for.na <- function(x,y,z, a, b) {
if(any(is.na(x), is.na(y), is.na(z))){
return(0)
}
return(x*exp(-a*y)*exp(-b*z))
}
result = rep(0, length(test))
for (ROW in 1:length(test)){
for (i in colNames)
{
check_here_for_na <- check.for.na(test[ROW,i], test[ROW,i+1], test[ROW,i+2], a, b)
result[ROW] = result[ROW] + check_here_for_na
}
}
return(result)
}
new.myfun(1,1)

multiple next statements in function

I'm trying to run a function much more complex but equal to:
A <<- 5
B <<- 5
table <- data.frame(A,B)
backup <- table
test <- function(A,B){
while(A > 0){
r <- runif(1,0,1)
if ((r >= 0)&(r <= 0.5)){
A <- A + 1
B <- B - 1
} else if ((r > 0.5)&(r<=1)){
A <- A - 1
B <- B + 1
}
tab <- cbind(A,B)
table <<- rbind(table,tab)
if (nrow(table) == 10) {
break
} else if ((A > 0) & (nrow(table) != 10)) {
next
} else {if ((A == 0) & (nrow(table != 10)) { #pointing to error here?
A <- 5
B <- 5
table <- backup
next
}
}
}
So what I want this function to do is stop when the when number of rows of the table (= the number of times the function ran) is equal to a certain value, in this case 10. But A cannot take a value below 0. If A reaches 0 before the number of rows of the table is 10 the whole process has to start again, with the same inputvalues as before.
However, this function does not work. I think it's because I use multiple next statements, is that correct?
Thanks!
I think you're on the right track... just a few recommendations
I think this accomplishes what you're trying to achieve a little cleaner. I second Carl's suggestion of avoiding the global operator '<<-' when possible and passing the objects through to the function as arguments or parameters. I also second Justin's suggestion of avoiding the break command in favor of placing the return() call smartly. To accomplish this, I put your 10 (stopping criteria) directly into the while loop. I included it as a parameter n so that you can experiment with different stopping criteria. Hope this helps :)
test <- function(A,B,n){
A0 <- A
B0 <- B
tab <- data.frame(A,B)
while(A > 0 & nrow(tab) < n){
r <- runif(1,0,1)
if (r <= 0.5){
A <- A + 1
B <- B - 1
} else {
A <- A - 1
B <- B + 1
}
tab1 <- cbind(A,B)
tab <- rbind(tab,tab1)
if(nrow(tab)<n & A==0){
A<-5
B<-5
tab <- cbind(A0,B0)
names(tab) <- c('A', 'B')
print(paste('found A==0 in less than ', n, ' iterations', sep=''))
}
}
return(tab)
}
Testing function...
## running function
A <- 5
B <- 5
testDF <- test(A,B,10)
> testDF
A B
1 5 5
2 6 4
3 5 5
4 6 4
5 7 3
6 8 2
7 9 1
8 10 0
9 9 1
10 10 0

Resources