dat <- structure(list(V1 = structure(c(3L, 4L, 1L, 5L, 6L, 1L, 1L, 1L, 1L, 1L),
.Label = c("0,0%", "0,5%", "0,6%", "1,0%", "1,2%", "2,0%", "2,1%", "2,4%",
"3,0%", "3,3%", "4,0%", "5,0%", "7,0%"), class = "factor"),
V2 = structure(c(6L, 7L, 5L, 7L, 7L, 7L, 1L, 1L, 1L, 1L),
.Label = c("0,0%", "12,0%", "2,0%", "2,8%", "3,0%", "3,6%", "4,0%", "4,3%",
"5,0%", "6,0%", "6,4%", "7,0%", "7,9%", "8,0%"), class = "factor"),
V3 = structure(c(3L, 6L, 2L, 16L, 2L, 14L, 1L, 1L, 1L, 1L),
.Label = c("0,0%", "10,0%", "11,7%", "11,9%", "12,0%", "13,0%", "14,0%", "15,0%",
"18,0%", "18,9%", "25,0%", "30,0%", "7,0%", "8,0%", "9,0%", "9,1%"), class = "factor"),
V4 = structure(c(8L, 9L, 4L, 5L, 7L, 3L, 2L, 2L, 2L, 2L),
.Label = c("0,5%", "1,0%","12,0%", "14,0%", "14,3%", "15,0%", "16,0%", "16,3%", "18,0%",
"19,4%", "20,0%", "22,0%", "22,4%", "23,0%", "25,0%", "28,0%",
"28,5%", "30,0%", "35,0%", "50,0%"), class = "factor")),
row.names = c(NA, 10L), class = "data.frame")
I want to do 2 things:
1) Replace the , with a decimal .
2) Remove the % symbol
sapply(dat, function(x) as.numeric(gsub("%", "", x)))
sapply(dat, function(x) as.numeric(gsub(",", ".", x)))
Both of them give me NAs. What am I doing wrong here?
We need to do this in a single step: converting to numeric after removing only the % still leaves a character vector, because the , remains. So apply as.numeric only after doing both substitutions.
dat[] <- lapply(dat, function(x) as.numeric(gsub("%", "", gsub(",", ".", x))))
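A quick sanity check after the conversion (the printout below is indicative, based on the sample data above):
str(dat)
# 'data.frame': 10 obs. of  4 variables:
#  $ V1: num  0.6 1 0 1.2 2 0 0 0 0 0
#  $ V2: num  3.6 4 3 4 4 4 0 0 0 0
#  $ V3: num  11.7 13 10 9.1 10 8 0 0 0 0
#  $ V4: num  16.3 18 14 14.3 16 12 1 1 1 1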
If we are using the tidyverse:
library(tidyverse)
dat %>%
  mutate_all(funs(parse_number(str_replace(., ",", "."))))
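Note that funs() has since been deprecated in dplyr; with dplyr >= 1.0 the same idea can be written with across() (a sketch of the newer syntax):
library(tidyverse)
dat %>%
  mutate(across(everything(), ~ parse_number(str_replace(.x, ",", "."))))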
Thought I would add a tidyverse approach:
library(tidyverse)
dat <- dat %>%
  map_df(str_replace, pattern = ",", replacement = ".") %>%
  map_df(str_remove, pattern = "%") %>%
  map_df(as.numeric)
Definitely not the fastest approach:
mbm <- microbenchmark::microbenchmark(
  lap = {lapply(dat, function(x)
    as.numeric(gsub("%", "", gsub(",", ".", x))))},
  tidy = {dat %>%
    map_df(str_replace, pattern = ",", replacement = ".") %>%
    map_df(str_remove, pattern = "%") %>%
    map_df(as.numeric)})
This shows that the lapply approach is roughly 10x faster than my tidyverse approach, though it may be harder for some to read.
The goal of the for loop is to calculate the distance between each deer and each cow at every simultaneous time stamp and put it into a data frame. The loop works for deer 1 and all cattle (deer1 - cow1, cow2, cow3, ...), but it does not move on to deer 2 (deer2 - cow1, cow2, cow3, ...). It stops and produces:
Error in linfol[[j]] : subscript out of bounds
In addition: Warning messages:
1: In min(table(id)) : no non-missing arguments to min; returning Inf
2: In min(table(burst)) : no non-missing arguments to min; returning Inf
Any ideas on how to fix this? I appreciate all of your help.
library(lubridate)
require(rgdal)
library(adehabitatHR)
library(rgeos)
library(wildlifeDI)
library(sf)
library(tidyr)
library(purrr)
library(dplyr)
library(ggplot2)
library(rowr)
library(qpcR)
library(tidyverse)
del6 <- structure(list(Id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L
), .Label = c("A82117", "A82118", "A82119", "A82120", "A628",
"A629", "A630", "A631"), class = "factor"), Species = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("deer", "cow"), class = "factor"),
DateTime = structure(c(1559365200, 1559367000, 1559368800,
1559370600, 1559372400, 1559374200, 1559376000, 1559377800,
1559379600, 1559381400, 1559365200, 1559367000, 1559368800,
1559370600, 1559372400, 1559374200, 1559376000, 1559377800,
1559379600, 1559381400, 1559367000, 1559368800, 1559370600,
1559372400, 1559374200, 1559376000, 1559377800, 1559379600,
1559381400, 1559383200, 1559365200, 1559367000, 1559368800,
1559370600, 1559372400, 1559374200, 1559376000, 1559377800,
1559379600, 1559381400), class = c("POSIXct", "POSIXt"), tzone = "CST6CDT"),
x = c(654371.334599288, 654425.757711813, 654413.001859601,
654396.842641521, 654346.593176651, 654337.090447315, 654334.818175218,
654326.530950149, 654289.118946121, 654261.853959498, 651805.18706951,
651799.382793396, 651810.067280183, 651799.620449496, 651801.683057562,
651816.964086015, 651821.993327341, 651714.361813341, 651693.011227868,
651747.458989254, 652385.114529054, 652374.225278371, 652093.206807523,
652083.440205417, 652092.516704872, 652082.345404556, 652092.556187695,
652084.159078257, 652084.674447443, 652087.858880835, 652907.574768764,
652913.940744582, 652915.348511677, 652902.805542879, 652905.971983537,
652902.58817731, 652860.819066119, 652821.735425028, 652834.71368795,
652834.27029922), y = c(2939470.93183362, 2939450.68389254,
2939464.95474789, 2939471.49537518, 2939472.88154388, 2939478.49457091,
2939481.02639993, 2939460.28537739, 2939318.72673479, 2939260.75137547,
2938855.09928731, 2938836.31751033, 2938839.33629436, 2938838.11516351,
2938842.28331314, 2938829.93458363, 2938834.30422344, 2938857.68619733,
2938936.41572119, 2938907.99144485, 2942314.3327499, 2942310.36910381,
2942154.52809203, 2942165.81205587, 2942159.77141252, 2942159.06281473,
2942160.63606412, 2942162.33067677, 2942160.0434262, 2942160.29193881,
2943229.61402449, 2943227.81804756, 2943239.146907, 2943270.14022283,
2943280.16067867, 2943263.35708588, 2943347.8117451, 2943406.05189864,
2943415.94632734, 2943428.82622347)), row.names = c(NA, -40L
), class = "data.frame")
#subset by animal of interest
deers <- del6 %>%
  filter(Species == 'deer') %>%
  droplevels()
summary(deers)
cows <- del6 %>%
  filter(Species == 'cow') %>%
  droplevels()
summary(cows)
Dist_df <- NA
for(a in 1:length(deers)) {
  deersIDs <- unique(deers$Id)
  for(b in 1:length(cows)) {
    cowsIDs <- unique(cows$Id)
    for (i in 1:length(deersIDs)){
      deerID <- deersIDs[i]
      deer <- filter(deers, Id == deerID)
      deer.traj <- as.ltraj(xy = deer[,c("x","y")], date = deer$DateTime,
                            id = deerID, typeII = T)
      for (j in 1:length(cowsIDs)){
        cowID <- cowsIDs[j]
        cow <- filter(cows, Id == cowID)
        cow.traj <- as.ltraj(xy = cow[,c("x","y")], date = cow$DateTime,
                             id = cowID, typeII = T)
        sim <- GetSimultaneous(deer.traj, cow.traj, tc = 30*60)
        deer.sim <- sim[1]
        cow.sim <- sim[2]
        dist <- Prox(deer.sim, cow.sim, local = T)
        dist <- select(dist, -dt)
        Dist_df <- na.omit(Dist_df)
        dist$Id <- paste0(deerID[a], cowID[b])
        Dist_df <- rbind(Dist_df, dist)
      }
    }
  }
}
Consider expand.grid and Map and avoid the four nested for loops, especially avoiding the hazard of growing objects in a loop with rbind. See Patrick Burns' R Inferno - Circle 2: Growing Objects.
deers <- del6 %>% filter(Species=='deer') %>% droplevels()
summary(deers)
cows <- del6 %>% filter(Species=='cow') %>% droplevels()
summary(cows)
# GENERALIZED METHOD TO HANDLE EACH PAIR OF DEER AND COW ID
calculate_distance <- function(deerID, cowID) {
deer <- filter(deers, Id == deerID)
deer.traj <- as.ltraj(
xy=deer[,c("x","y")], date=deer$DateTime, id=deerID, typeII=TRUE
)
cow <- filter(cows, Id == cowID)
cow.traj <- as.ltraj(
xy=cow[,c("x","y")], date=cow$DateTime, id=cowID, typeII=TRUE
)
sim <- GetSimultaneous(deer.traj, cow.traj, tc=30*60)
deer.sim <- sim[1]
cow.sim <- sim[2]
dist <- Prox(deer.sim, cow.sim, local=TRUE)
dist <- select(dist, -dt)
dist$Id <- paste0(deerID, "_", cowID)
return(dist)
}
# RETRIEVE ALL PAIRWISE MATCHES OF IDs
cross_join_ids <- expand.grid(
deerID = unique(deers$Id), cowID = unique(cows$Id)
)
# BUILD LIST OF DATA FRAMES
dist_dfs <- Map(
calculate_distance, cross_join_ids$deerID, cross_join_ids$cowID
)
# COMPILE SINGLE DATA FRAME
master_dist <- dplyr::bind_rows(dist_dfs)
For any problematic calculations you can wrap the processing in tryCatch to print errors to the console and return NULL (which bind_rows will drop from the final compilation):
calculate_distance <- function(deerID, cowID) {
tryCatch({
deer <- filter(deers, Id == deerID)
deer.traj <- as.ltraj(
xy=deer[,c("x","y")], date=deer$DateTime, id=deerID, typeII=TRUE
)
cow <- filter(cows, Id == cowID)
cow.traj <- as.ltraj(
xy=cow[,c("x","y")], date=cow$DateTime, id=cowID, typeII=TRUE
)
sim <- GetSimultaneous(deer.traj, cow.traj, tc=30*60)
deer.sim <- sim[1]
cow.sim <- sim[2]
dist <- Prox(deer.sim, cow.sim, local=TRUE)
dist <- select(dist, -dt)
dist$Id <- paste0(deerID, "_", cowID)
return(dist)
}, error = function(e) {
print(e)
return(NULL)
})
}
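A brief usage sketch of the wrapped function (assuming the objects built above): pairs that error return NULL and simply drop out of the final bind_rows.
dist_dfs <- Map(
  calculate_distance, cross_join_ids$deerID, cross_join_ids$cowID
)
sapply(dist_dfs, is.null)                 # TRUE flags deer/cow pairs that errored
master_dist <- dplyr::bind_rows(dist_dfs)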
I am attempting to populate two newly empty columns in a data frame with data from other columns in the same data frame, in different ways depending on whether those columns are populated.
I am trying to populate the values of HIGH_PRCN_LAT and HIGH_PRCN_LON (previously called F_Lat and F_Lon), which represent the final latitudes and longitudes for those rows. This will be based on the values of the other columns in the table.
Case 1: Lat/Lon2 are populated (as in IDs 1 & 2): using the great-circle
algorithm, a midpoint between them should be calculated and
then placed into F_Lat & F_Lon.
Case 2: Lat/Lon2 are empty: the values of Lat/Lon1 should be put
into F_Lat and F_Lon (as with IDs 3 & 4).
My code is as follows but doesn't work (see previous versions, removed in an edit).
The preparatory code I am using is as follows:
incidents <- structure(list(id = 1:9, StartDate = structure(c(1L, 3L, 2L,
2L, 2L, 3L, 1L, 3L, 1L), .Label = c("02/02/2000 00:34", "02/09/2000 22:13",
"20/01/2000 14:11"), class = "factor"), EndDate = structure(1:9, .Label = c("02/04/2006 20:46",
"02/04/2006 22:38", "02/04/2006 23:21", "02/04/2006 23:59", "03/04/2006 20:12",
"03/04/2006 23:56", "04/04/2006 00:31", "07/04/2006 06:19", "07/04/2006 07:45"
), class = "factor"), Yr.Period = structure(c(1L, 1L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), .Label = c("2000 / 1", "2000 / 2", "2000 /3"
), class = "factor"), Description = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "ENGLISH TEXT", class = "factor"),
Location = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L
), .Label = c("Location 1", "Location 1 : Location 2"), class = "factor"),
Location.1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "Location 1", class = "factor"), Postcode.1 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Postcode 1", class = "factor"),
Location.2 = structure(c(2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L), .Label = c("", "Location 2"), class = "factor"), Postcode.2 = structure(c(2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L), .Label = c("", "Postcode 2"
), class = "factor"), Section = structure(c(2L, 2L, 3L, 1L,
4L, 4L, 2L, 1L, 4L), .Label = c("East", "North", "South",
"West"), class = "factor"), Weather.Category = structure(c(1L,
2L, 4L, 2L, 2L, 2L, 4L, 1L, 3L), .Label = c("Animals", "Food",
"Humans", "Weather"), class = "factor"), Minutes = c(13L,
55L, 5L, 5L, 5L, 522L, 1L, 11L, 22L), Cost = c(150L, 150L,
150L, 20L, 23L, 32L, 21L, 11L, 23L), Location.1.Lat = c(53.0506727,
53.8721035, 51.0233529, 53.8721035, 53.6988355, 53.4768766,
52.6874562, 51.6638245, 51.4301359), Location.1.Lon = c(-2.9991256,
-2.4004125, -3.0988341, -2.4004125, -1.3031529, -2.2298073,
-1.8023421, -0.3964916, 0.0213837), Location.2.Lat = c(52.7116187,
53.746791, NA, 53.746791, 53.6787167, 53.4527824, 52.5264907,
NA, NA), Location.2.Lon = c(-2.7493169, -2.4777984, NA, -2.4777984,
-1.489026, -2.1247029, -1.4645023, NA, NA)), class = "data.frame", row.names = c(NA, -9L))
#gpsColumns is used as the following line of code is used for several data frames.
gpsColumns <- c("HIGH_PRCN_LAT", "HIGH_PRCN_LON")
incidents[, gpsColumns] <- NA
#create separate variable(?) containing a list of which rows are complete
ind <- complete.cases(incidents[, 17])
#populate rows that have two Lat/Lons with the great-circle midpoint of both values
incidents[ind, c("HIGH_PRCN_LON_2","HIGH_PRCN_LAT_2")] <-
  with(incidents[ind, , drop = FALSE],
       do.call(rbind, geosphere::midPoint(cbind.data.frame(Location.1.Lon, Location.1.Lat), cbind.data.frame(Location.2.Lon, Location.2.Lat))))
#populate rows with one Lat/Lon with those values
incidents[!ind, c("HIGH_PRCN_LAT","HIGH_PRCN_LON")] <- incidents[!ind, c("Location.1.Lat","Location.1.Lon")]
I will use the geosphere::midPoint function based on a recommendation here: http://r.789695.n4.nabble.com/Midpoint-between-coordinates-td2299999.html.
Unfortunately, it doesn't appear that this way of populating the column will work when there are several cases.
The current error that is thrown is:
Error in `$<-.data.frame`(`*tmp*`, F_Lat, value = integer(0)) :
replacement has 0 rows, data has 178012
Edit: also posted to reddit: https://www.reddit.com/r/Rlanguage/comments/bdvavx/conditional_updating_column_in_dataframe/
Edit: Added clarity on the parts of the code I do not understand.
#replaces the F_Lat2/F_Lon2 columns in rows with both sets of input coordinates
dataframe[ind, c("F_Lat2","F_Lon2")] <-
#I am unclear on what this means, specifically what the "with" function does, what "drop=FALSE" does, and why they were used in this case.
with(dataframe[ind,,drop=FALSE],
#I am unclear on what do.call and rbind are doing here, but the second half (geosphere onwards) binds the Lats and Lons to make coordinates as inputs for the gcIntermediate function.
do.call(rbind, geosphere::gcIntermediate(cbind.data.frame(Lat1, Lon1),
cbind.data.frame(Lat2, Lon2), n = 1)))
Though your code doesn't work as-written for me, and I cannot calculate the same precise values you expect, I suspect the error you're seeing can be fixed with these steps. (Data is down at the bottom here.)
Pre-populate the empty columns.
Pre-calculate the complete.cases step; it'll save time.
Use cbind.data.frame inside gcIntermediate.
I'm inferring from
gcIntermediate([dataframe...
^
this is an error in R
that you are binding those columns together, so I'll use cbind.data.frame. (Using cbind itself produced some ignorable warnings from geosphere, so you can use it instead and perhaps suppressWarnings, but that function is a little strong in that it'll mask other warnings as well.)
Also, since it appears you want one intermediate value for each pair of coordinates, I added the gcIntermediate(..., n=1) argument.
The use of do.call(rbind, ...) is because gcIntermediate returns a list, so we need to bring them together.
dataframe$F_Lon2 <- dataframe$F_Lat2 <- NA_real_
ind <- complete.cases(dataframe[,4])
dataframe[ind, c("F_Lat2","F_Lon2")] <-
with(dataframe[ind,,drop=FALSE],
do.call(rbind, geosphere::gcIntermediate(cbind.data.frame(Lat1, Lon1),
cbind.data.frame(Lat2, Lon2), n = 1)))
dataframe[!ind, c("F_Lat2","F_Lon2")] <- dataframe[!ind, c("Lat1","Lon1")]
dataframe
# ID Lat1 Lon1 Lat2 Lon2 F_Lat F_Lon F_Lat2 F_Lon2
# 1 1 19.05067 -3.999126 92.71332 -6.759169 55.88200 -5.379147 55.78466 -6.709509
# 2 2 58.87210 -1.400413 54.74679 -4.479840 56.80945 -2.940126 56.81230 -2.942029
# 3 3 33.02335 -5.098834 NA NA 33.02335 -5.098834 33.02335 -5.098834
# 4 4 54.87210 -4.400412 NA NA 54.87210 -4.400412 54.87210 -4.400412
Update, using your new incidents data and switching to geosphere::midPoint.
Try this:
incidents$F_Lon2 <- incidents$F_Lat2 <- NA_real_
ind <- complete.cases(incidents[, c("Location.2.Lat", "Location.2.Lon")])
incidents[ind, c("F_Lat2","F_Lon2")] <-
with(incidents[ind,,drop=FALSE],
geosphere::midPoint(cbind.data.frame(Location.1.Lat,Location.1.Lon),
cbind.data.frame(Location.2.Lat,Location.2.Lon)))
incidents[!ind, c("F_Lat2","F_Lon2")] <- incidents[!ind, c("Location.1.Lat","Location.1.Lon")]
One (big) difference is that geosphere::gcIntermediate(..., n=1) returns a list of results, whereas geosphere::midPoint(...) (no n=) returns just a matrix, so no rbinding required.
Data:
dataframe <- read.table(header=T, stringsAsFactors=F, text="
ID Lat1 Lon1 Lat2 Lon2 F_Lat F_Lon
1 19.0506727 -3.9991256 92.713318 -6.759169 55.88199535 -5.3791473
2 58.8721035 -1.4004125 54.746791 -4.47984 56.80944725 -2.94012625
3 33.0233529 -5.0988341 NA NA 33.0233529 -5.0988341
4 54.8721035 -4.4004125 NA NA 54.8721035 -4.4004125")
For a sample dataframe:
df <- structure(list(area = structure(c(1L, 4L, 3L, 8L, 5L, 7L, 6L,
2L), .Label = c("DE1", "DE3", "DE4", "DE5", "DE9", "DEA", "DEB",
"DEC"), class = "factor"), to.delete = c(1L, 0L, 1L, 0L, 1L,
1L, 1L, 0L)), .Names = c("area", "to.delete"), class = "data.frame", row.names = c(NA,
-8L))
I want to create a list of the areas which have a '1' in the 'to.delete' column. I know how to subset the 1s out of this data frame; however, I want the list of areas because eventually I will use it to extract these areas from the main master data file (df2, listed below).
df2 <- structure(list(id = 1:24, area = structure(c(1L, 1L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 3L, 8L, 8L, 8L, 8L, 5L, 7L, 7L, 7L, 6L, 6L,
2L, 2L, 2L, 2L), .Label = c("DE1", "DE3", "DE4", "DE5", "DE9",
"DEA", "DEB", "DEC"), class = "factor")), .Names = c("id", "area"
), class = "data.frame", row.names = c(NA, -24L))
I prefer to do this in two steps, so I can easily see which areas I have deleted (thanks to the answers below for suggesting the use of a list).
a <- list(df$area[df$to.delete == 1])
df2.subset <- df2[df2$area %in% a,]
This, however, doesn't seem to work at the moment, so if anyone has any ideas, that would be great.
df2 should then be left with only areas DE5, DEC and DE3.
Many thanks.
Here is another method using split to collect the areas into two lists:
# get two lists of areas and give the list items appropriate names
# (split orders the groups by to.delete level: 0 = keep, 1 = drop)
keepDrop <- setNames(split(df$area, df$to.delete), c("keep", "drop"))
# now perform dropping
df2.smaller <- df2[df2$area %in% keepDrop[["keep"]],]
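A quick check with the sample data (output indicative, assuming df and df2 from the question):
keepDrop$keep
# [1] DE5 DEC DE3
# Levels: DE1 DE3 DE4 DE5 DE9 DEA DEB DEC
unique(df2.smaller$area)
# [1] DE5 DEC DE3
# Levels: DE1 DE3 DE4 DE5 DE9 DEA DEB DEC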
We can use subset. Based on the description, the OP wants to subset the rows of the main data ('maindata') based on the 'area' values that correspond to 1 in the 'to.delete' column. In that case, we extract the 'area' (df$area[df$to.delete == 1]) and use %in% to subset 'maindata'.
subset(maindata, area %in% df$area[df$to.delete==1])
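If the goal is instead to be left with only DE5, DEC and DE3 (i.e. drop the flagged areas from df2), negate the condition:
subset(df2, !(area %in% df$area[df$to.delete == 1]))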
It's not too clear what you are asking.
This will create a list where each element is a different Area:
lapply(df$area[df$to.delete == 1], function(x) x)
If you want a list with just one element containing all the areas:
list(df$area[df$to.delete == 1])
Edit:
To answer the second part of your question:
a <- list(df$area[df$to.delete == 1])
df2.subset <- df2[!df2$area %in% a[[1]], ]
Here's what you can try.
a <- as.list(subset(df,df$to.delete == 1))
> a
$area
[1] DE1 DE4 DE9 DEB DEA
Levels: DE1 DE3 DE4 DE5 DE9 DEA DEB DEC
$to.delete
[1] 1 1 1 1 1
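Building on that, a$area can then be used to drop those areas from df2 (a sketch):
df2.subset <- df2[!df2$area %in% a$area, ]
df2.subset   # left with only the DE5, DEC and DE3 rows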
My data looks like this
df<- structure(list(A = structure(c(7L, 6L, 5L, 4L, 3L, 2L, 1L, 1L,
1L), .Label = c("", "P42356;Q8N8J0;A4QPH2", "P67809;Q9Y2T7",
"Q08554", "Q13835", "Q5T749", "Q9NZT1"), class = "factor"), B = structure(c(9L,
8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("P62861", "P62906",
"P62979;P0CG47;P0CG48", "P63241;Q6IS14", "Q02413", "Q07955",
"Q08554", "Q5T749", "Q9UQ80"), class = "factor"), C = structure(c(9L,
8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("", "P62807;O60814;P57053;Q99879;Q99877;Q93079;Q5QNW6;P58876",
"P63241;Q6IS14", "Q02413", "Q16658", "Q5T750", "Q6P1N9", "Q99497",
"Q9UQ80"), class = "factor")), .Names = c("A", "B", "C"), class = "data.frame", row.names = c(NA,
-9L))
I want to count how many elements are in each column, including those that are separated by a ";". For example, in this case the
first column has 9 elements, the second column has 12 elements and the third column has 16 elements. Then I want to check how many times an element is repeated across columns, for example
string    number of times    columns
Q5T749    2                  1,2
and then remove the strings which are seen more than once from the df.
One way to approach this is to start by re-organizing the data into a form that is more convenient to work with. The tidyr and dplyr packages are useful for that sort of thing.
library(tidyr)
df$index <- 1:nrow(df)
df <- gather(df, key = 'variable', value = 'value', -index, na.rm = TRUE)
df <- separate(df, "value", into = paste("x", 1:(1 + max(nchar(gsub("[^;]", "", df$value)))), sep = ""), sep = ";", fill = "right")
df <- gather(df, "which", "value", -index, -variable)
Once you do that, counting each element is easy:
addmargins(t(table(df[, c("variable", "value")])), margin = 2)
Dropping duplicates is also easy.
df <- df[!duplicated(df$value), ]
If you really want to put the data back into the original form you can (though I don't recommend it).
df <- spread(df, key = "variable", value = "value")
library(dplyr)
summarize(group_by(df, index),
A = paste(na.omit(A), collapse = ";"),
B = paste(na.omit(B), collapse = ";"),
C = paste(na.omit(C), collapse = ";"))
For the count of elements in each column, use this:
sapply(df,function(x) length(unlist(sapply(strsplit(as.character(x),"\\s+"),strsplit,split=";"))))
For counting the repetitions, use this:
words <- lapply(df,function(x) unlist(sapply(strsplit(as.character(x),"\\s+"),strsplit,split=";")))
dup_table <- table(unlist(words))
dup_table
There is an admittedly crude approach to remove the repetitions:
pat <- names(dup_table)[unname(dup_table)>1]
for(i in pat)
  df <- as.data.frame.list(lapply(df, function(x) gsub(pattern = i, replacement = "", x)))
But there is one problem: it will replace all the occurrences of a particular pattern, including partial matches inside longer strings.
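A possible way around that (a sketch, building on dup_table above): split each cell on ";", drop only the exact tokens that occur more than once, and paste the survivors back together.
dup <- names(dup_table)[dup_table > 1]
df[] <- lapply(df, function(x) {
  vapply(strsplit(as.character(x), ";"),
         function(tok) paste(tok[!tok %in% dup], collapse = ";"),
         character(1))
})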
I am looking at extracting df's from within a list of multiple df's into separate data frames based on a condition (whether the column names of a df within the list contain the name I am looking for).
For illustration purposes I have created an example which resembles the situation I am in.
I have a list with multiple data frames, and the dput of that list is given below:
structure(list(V1 = structure(list(lvef = c(0.965686195194885,
0.0806777632648268, -0.531729196500083, -0.511913109608259, -0.413670941196816,
-0.0501899795864357, -0.337583918771946, 1.16086745780346, -0.478358865835724,
-1.95009138673888), hbc = c(-0.389950511350405, -0.904388183933348,
0.811821977223064, -0.868381700124344, -0.637307418402866, -1.04703715824204,
-0.394340445217658, -0.194653869597247, 0.00822402232044511,
-0.145032587618231), id = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "NA", class = "factor")), .Names = c("lvef",
"hbc", "id"), row.names = c(NA, -10L), class = "data.frame"),
V2 = structure(list(ersta = c(-0.254360310986174, 0.3859806928747,
-0.135741797055127, 1.03929145413636, -0.484219739337178,
0.255476285148917, 1.0479422937128, 0.146613094683722, -0.914377222535014,
1.75052418161618, -0.275059500684816, 2.34861397588234, 0.00183723766664941,
0.97612891408903, 0.278868537504227, 0.456979477254684, 1.46323739326792,
0.664511602217853, 0.870420202897545, 1.38228375734407),
pgrsta = c(-1.49129812271989, 0.820330747101906, -0.0469488167129374,
0.471549380446308, -1.71312120132398, 0.0578140025416816,
1.67016363826724, 0.226180835709491, -2.00294530465909,
-0.0464857361954717, 0.306942902768782, -0.785096914460742,
0.283822632249141, -0.260774679911329, -1.2865970194309,
0.307972619170242, 0.223715024597144, -1.01642533651475,
-0.12229427204957, 0.223326519096996), id = structure(c(7L,
7L, 7L, 7L, 4L, 1L, 3L, 5L, 6L, 2L, 7L, 7L, 7L, 7L, 4L,
1L, 3L, 5L, 6L, 2L), class = "factor", .Label = c("-0.10863576856322",
"-0.317324527228699", "-0.422764348315332", "0.285132258310185",
"1.23305496219042", "1.39326602279981", "NA"))), .Names = c("ersta",
"pgrsta", "id"), row.names = c(NA, -20L), class = "data.frame"),
V3 = structure(list(hormrec = 1:15, event = structure(c(10L,
10L, 10L, 10L, 10L, 10L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L), .Label = c("1", "2", "3", "4", "5", "6", "7", "8", "9",
"NA"), class = "factor")), .Names = c("hormrec", "event"), row.names = c(NA,
-15L), class = "data.frame"), V4 = structure(list(asat = c(-0.321423784000631,
0.181345361079582, 0.389158724418319, -1.15251833725336,
-0.351981383678293, -0.506888212379408, 0.870705917350059,
-0.626883041051641, -0.321843006223371, -0.674564527029912,
-0.609383943267379, -0.181661119817784, -1.63676077872658
), lab = structure(c(1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 2L), .Label = c("btest", "NA", "rtest"), class = "factor")), .Names = c("asat",
"lab"), row.names = c(NA, -13L), class = "data.frame")), .Names = c("V1",
"V2", "V3", "V4"))
I am trying to extract data frames from the list based on the condition that, if a data frame within the list contains the required column name(s), then that data frame should go into a separate data frame. So far, I have been able to extract the data frames into a list using the following code:
# function to extract required df's
trial <- function(x)
{
  reqname <- c("hbc","ersta")           # column names to check for
  data <- x
  lapply(seq(data), function(i){        # loop through all the data frames in the list
    y <- data.frame(data[[i]])          # extract df in y
    names <- names(y)                   # extract names of df
    for(a in 1:length(reqname))         # loop through the length of reqname
    {
      if(reqname[a] %in% names)         # check if column name/s present in current df
      {
        z <- y                          # extract df into another df
        return(z)                       # return df
      }
    }
  }
  )
}
The above function returns a list of matching df's along with NULLs where there was no match. I am looking for a modification so that each selected data frame comes out separately. If there are two df's matching the requirement, then the output should be two separate data frames.
I will appreciate all and any help in finding a solution.
You can use lapply() plus a custom function to identify the wanted outputs. For instance, if k is your list,
trial <- function(x)
{
  reqnames <- c("hbc","ersta")
  k <- lapply(x, function(y) any(names(y) %in% reqnames))
  k <- which(k == 1)
  x[k]
}
This outputs a list with only the dataframes containing at least one of the names in reqnames.
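For example, with the sample list from the question (call it k, as above), this should pick out the two data frames that contain "hbc" or "ersta" (output indicative):
matched <- trial(k)
names(matched)
# [1] "V1" "V2"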
We can remove the NULL elements with Filter
lst1 <- Filter(length, trial(lst))
If we need multiple data.frame objects in the global environment, use list2env after renaming the list elements with the object names
names(lst1) <- paste0('dat', seq_along(lst1))
list2env(lst1, envir = .GlobalEnv)
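After this, the matched data frames are available as separate objects in the global environment, e.g. (indicative, given the sample list and the trial()/Filter() steps above):
ls(pattern = "^dat")
# [1] "dat1" "dat2"
head(dat1)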