Split arbitrary column into melted data frame - r

I have a data.frame with an ugly column containing structured data. Each cell can hold from 1 to 40 values of interest, each separated by an HTML break ("<br />"). The extracted values have the form 1.1, i.e. an integer, a period, and another integer.
How do I separate and melt this column into different rows?
I know lapply and tidyr::separate are probably the ways to go, but I have not succeeded yet, so I'm asking for help.
testdata is here:
testdata <- dget("http://pastebin.com/download.php?i=VS2cq2rB")
The data frame holds two columns: "id" and "moduler".
I'd like to have "id" and "value" instead. The end result should be something like this:
"id", "value"
1, 1.1
1, 1.2
1, 1.3
1, 2.4
2, 1.1
2, 1.3
2, 3.3
This is my latest take - pretty far from where I started with lapply:
origdf <- data.frame()
#names(newdf) <- c("id", 'pnummer', 'moduler')
for (i in 1:nrow(hs)) {
  newdf <- data.frame()
  newdf[i, 'id'] <- hs[i, 'id']
  newdf[i, 'pnummer'] <- hs[i, 'pnummer']
  tmp <- unlist(strsplit(as.character(hs[i, 'moduler']), "<br />", fixed = TRUE))
  for (m in 3:length(tmp) + 3) {
    newdf[i, m] <- tmp[m]
  }
  origdf <- dplyr::bind_rows(newdf, origdf)
}

Here's a possible data.table approach. Basically I'm just splitting moduler by "<br />" or "<br />Installationsmontør" by id
library(data.table)
setDT(testdata)[, .(value = unlist(strsplit(as.character(moduler),
                                            "<br />|<br />Installationsmontør"))), by = id]
# id value
# 1: 2862 1.1
# 2: 2862 1.2
# 3: 2862 1.3
# 4: 2862 1.4
# 5: 2862 1.5
# ---
# 132: 2877 3.6
# 133: 2877 4.1
# 134: 2877 4.4
# 135: 2877 4.5
# 136: 2877 4.6
Or similarly with the splitstackshape package
library(splitstackshape)
cSplit(testdata, splitCols = "moduler",
       sep = "<br />|<br />Installationsmontør",
       direction = "long", fixed = FALSE, stripWhite = FALSE)
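Since the question mentions tidyr, a sketch along those lines may also work (assuming testdata as above and a tidyr version that has separate_rows; fragments without a number, such as the "Installationsmontør" text, are dropped by the filter):
library(dplyr)
library(tidyr)
library(stringr)
testdata %>%
  mutate(moduler = as.character(moduler)) %>%   # separate_rows wants character, not factor
  separate_rows(moduler, sep = "<br />") %>%
  mutate(value = as.numeric(str_extract(moduler, "[0-9]+\\.[0-9]+"))) %>%
  filter(!is.na(value)) %>%
  select(id, value)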

I would try the strsplit function with a simple loop:
newdata <- NULL
a <- 1
b <- 0
for (k in 1:length(testdata$moduler)) {
  M <- unlist(strsplit(as.character(testdata$moduler[k]), "<br />|<br />Installationsmontør"))
  b <- b + length(M)
  newdata$moduler[a:b] <- M
  newdata$id[a:b] <- testdata$id[k]
  a <- b + 1
}
newdata <- as.data.frame(newdata)

Here is another option using unnest from tidyr. We extract the numeric part ([0-9.]+) using str_extract_all from library(stringr); the output is a list. We set the names of the list elements to the 'id' column of 'testdata' and unnest:
library(tidyr)
library(stringr)
res <- unnest(setNames(lapply(str_extract_all(testdata$moduler, '[0-9.]+'),
                              as.numeric), testdata$id), id)
colnames(res)[2] <- 'value'
head(res)
# id value
#1 2862 1.1
#2 2862 1.2
#3 2862 1.3
#4 2862 1.4
#5 2862 1.5
#6 2862 1.6
dim(res)
#[1] 136 2
Or a base R approach: extract the numeric elements with regmatches/gregexpr into a list, get the length of each list element with lengths, replicate the 'id' column of 'testdata' based on that, unlist the list, and create a new data.frame.
lst <- lapply(regmatches(testdata$moduler,
                         gregexpr('[0-9.]+', testdata$moduler)), as.numeric)
res2 <- data.frame(id = testdata$id[rep(1:nrow(testdata), lengths(lst))],
                   value = unlist(lst))
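Both approaches extract the same matches in the same order, so a quick sanity check should line up (assuming testdata is loaded):
dim(res2)
#[1] 136   2     # same shape as the tidyr result
all.equal(res$value, res2$value)
#[1] TRUE        # expected, since both use the pattern '[0-9.]+'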

How to parse text data into data.table in a tidy form?

I have a text file that contains image pixel values as below:
#1 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,1.225475199490155542e-01,1.044848965437222138e-01,1.237544502265838786e-01,1.715363333404669177e-01,1.922596029233400172e-01,1.809632738682011854e-01,1.797130234316194342e-01,1.738541208375123936e-01,1.444294554581726231e-01,1.321258390981746855e-01,1.344635498234532101e-01,1.436132527743466947e-01,1.395290556225499690e-01,1.374780604935658956e-01,1.346506483347080507e-01,1.280550646990075425e-01,1.248504215497622527e-01,1.178248061901537996e-01,1.298443201619972898e-01,1.553180115989083732e-01,1.580724143044860419e-01,1.784962367422186780e-01,1.907025124594779186e-01,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan
#2 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,1.029349857154064074e-01,9.448919637788849579e-02,1.059611529861169132e-01,1.123315418475298866e-01,1.044427274454799576e-01,1.201363996329007783e-01,1.282688456719490722e-01,1.251468493081038524e-01,1.305904505950917782e-01,1.166948019212366294e-01,1.099250506785318382e-01,1.136641770357243175e-01,1.130515076243375772e-01,1.184654413023679964e-01,1.445082878208643895e-01,1.663965434098903795e-01,1.663395733842590318e-01,1.752476275152526075e-01,1.685796922638230499e-01,1.482366311004082449e-01,1.309908022384465853e-01,1.261424559469170870e-01,1.268358150633545067e-01,1.255352810594060065e-01,1.259829554332418666e-01,1.289792505226832475e-01,1.297540150693830274e-01,1.209480533761810861e-01,1.285694058734546119e-01,1.369298058593048373e-01,1.461700389952401702e-01,1.431042116739904002e-01,1.712214395634834019e-01,1.818925300859868255e-01,2.010257021882600748e-01,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan
#3 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,9.861446120242163549e-02,9.304676318969960780e-02,9.864122376278822157e-02,1.075597393605647739e-01,1.131419975961711483e-01,1.146375133556569031e-01,1.204342658911874697e-01,1.228412754806565282e-01,1.240924670494341492e-01,1.163476394083799020e-01,1.073797480686657368e-01,1.017817224886293226e-01,1.131027905414023760e-01,1.114406335131803705e-01,1.227824308916071611e-01,1.329011478552513670e-01,1.441114715371090704e-01,1.604792748573601047e-01,1.527513461191236099e-01,1.380147589010027598e-01,1.288032806310404343e-01,1.338005227090968141e-01,1.255554854466473802e-01,1.173452604805394900e-01,1.143985402480809654e-01,1.202454679138123678e-01,1.267178125929230847e-01,1.241315491837501339e-01,1.347653795894559747e-01,1.349437732217280139e-01,1.301418957926175068e-01,1.313508293861232468e-01,1.742619338497762571e-01,1.858488867892321983e-01,1.877861224975270471e-01,1.803044688712685528e-01,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan
#4 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,8.736886296542736852e-02,8.908654375958220684e-02,8.620668521597033007e-02,9.500020858506200150e-02,1.126136404574935440e-01,1.187951696788534656e-01,1.168147013779436694e-01,1.109278058355442492e-01,1.128276541805584010e-01,1.173942164407098532e-01,1.152133179543410046e-01,1.111828410014303326e-01,1.192855572113103724e-01,1.157219419285210882e-01,1.051462987022579870e-01,1.042841664852307976e-01,1.263179021208208075e-01,1.543027512926945510e-01,1.531517647661817527e-01,1.370377223529097022e-01,1.217984978313198102e-01,1.340931752979427627e-01,1.274053299614930634e-01,1.206931794950223541e-01,1.149389700113669505e-01,1.083743218115938711e-01,1.135429261076744967e-01,1.224571336189042570e-01,1.316256830092336905e-01,1.296892050846524258e-01,1.220541991422918193e-01,1.251462726710364792e-01,1.475487955738131740e-01,1.8
.
.
.
.
The txt file holds a matrix of values, one value per pixel, with each (very long) row formatted as above.
When I read the file in R as:
txt <- read.table("ndvi_20180102_081439_1005_3B.txt")
It produces a data.frame as below:
V1
#1 nan,nan,nan,nan,-0.131231,nan,nan,nan,....
#2 nan,nan,nan,1.2323,nan,nan,-1,2313,nan,....
.
.
.
#187 nan,nan,nan,1.12323,nan,nan,...
#188 nan,nan,0.2323,nan,nan,...
I want it in this form to calculate the mean of pixel values:
#1 nan
#2 nan
#3 -1,23232
#4 nan
.
.
.
.
I tried to separate the values with tidyr::separate, but I don't want to hard-code the number of output columns, since I need to do this for about 439 files in a loop.
Finally, I want this form:
#file1 #file2 #file3 ...... #file439
#1 nan nan nan
#2 nan nan nan
#3 nan -1,32 nan
#4 -1,3 0,22 nan
.
.
.
How can I convert the text in the desired form?
An easy workaround for this would be to use the fread function from the data.table package.
txt <- fread(file = "ndvi_20180102_081439_1005_3B.txt", sep = ",")
To get the mean along each column you can use
txt[, lapply(X = .SD, FUN = mean), .SDcols = colnames(txt)]
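Two caveats worth flagging: the nan entries come in as NaN, so the means will be NaN unless you drop them (na.rm = TRUE also removes NaN), and the first column may parse as character because of the "#1 nan" labels, so restricting .SDcols to the numeric columns is safer. A small sketch of that tweak:
num_cols <- names(txt)[sapply(txt, is.numeric)]   # skip any character label column
txt[, lapply(.SD, mean, na.rm = TRUE), .SDcols = num_cols]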
Hope that helps
data.table's fread mentioned by @Rage is a good choice. We need to take a little effort to deal with your first column, where the row label and the first value are fused into fields like "#1 nan", separated by a space:
library(data.table)
x <- "#1 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,1.225475199490155542e-01,1.044848965437222138e-01,1.237544502265838786e-01,1.715363333404669177e-01,1.922596029233400172e-01,1.809632738682011854e-01,1.797130234316194342e-01,1.738541208375123936e-01,1.444294554581726231e-01,1.321258390981746855e-01,1.344635498234532101e-01,1.436132527743466947e-01,1.395290556225499690e-01,1.374780604935658956e-01,1.346506483347080507e-01,1.280550646990075425e-01,1.248504215497622527e-01,1.178248061901537996e-01,1.298443201619972898e-01,1.553180115989083732e-01,1.580724143044860419e-01,1.784962367422186780e-01,1.907025124594779186e-01,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan
#2 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,1.029349857154064074e-01,9.448919637788849579e-02,1.059611529861169132e-01,1.123315418475298866e-01,1.044427274454799576e-01,1.201363996329007783e-01,1.282688456719490722e-01,1.251468493081038524e-01,1.305904505950917782e-01,1.166948019212366294e-01,1.099250506785318382e-01,1.136641770357243175e-01,1.130515076243375772e-01,1.184654413023679964e-01,1.445082878208643895e-01,1.663965434098903795e-01,1.663395733842590318e-01,1.752476275152526075e-01,1.685796922638230499e-01,1.482366311004082449e-01,1.309908022384465853e-01,1.261424559469170870e-01,1.268358150633545067e-01,1.255352810594060065e-01,1.259829554332418666e-01,1.289792505226832475e-01,1.297540150693830274e-01,1.209480533761810861e-01,1.285694058734546119e-01,1.369298058593048373e-01,1.461700389952401702e-01,1.431042116739904002e-01,1.712214395634834019e-01,1.818925300859868255e-01,2.010257021882600748e-01,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan
#3 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,9.861446120242163549e-02,9.304676318969960780e-02,9.864122376278822157e-02,1.075597393605647739e-01,1.131419975961711483e-01,1.146375133556569031e-01,1.204342658911874697e-01,1.228412754806565282e-01,1.240924670494341492e-01,1.163476394083799020e-01,1.073797480686657368e-01,1.017817224886293226e-01,1.131027905414023760e-01,1.114406335131803705e-01,1.227824308916071611e-01,1.329011478552513670e-01,1.441114715371090704e-01,1.604792748573601047e-01,1.527513461191236099e-01,1.380147589010027598e-01,1.288032806310404343e-01,1.338005227090968141e-01,1.255554854466473802e-01,1.173452604805394900e-01,1.143985402480809654e-01,1.202454679138123678e-01,1.267178125929230847e-01,1.241315491837501339e-01,1.347653795894559747e-01,1.349437732217280139e-01,1.301418957926175068e-01,1.313508293861232468e-01,1.742619338497762571e-01,1.858488867892321983e-01,1.877861224975270471e-01,1.803044688712685528e-01,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan
#4 nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,8.736886296542736852e-02,8.908654375958220684e-02,8.620668521597033007e-02,9.500020858506200150e-02,1.126136404574935440e-01,1.187951696788534656e-01,1.168147013779436694e-01,1.109278058355442492e-01,1.128276541805584010e-01,1.173942164407098532e-01,1.152133179543410046e-01,1.111828410014303326e-01,1.192855572113103724e-01,1.157219419285210882e-01,1.051462987022579870e-01,1.042841664852307976e-01,1.263179021208208075e-01,1.543027512926945510e-01,1.531517647661817527e-01,1.370377223529097022e-01,1.217984978313198102e-01,1.340931752979427627e-01,1.274053299614930634e-01,1.206931794950223541e-01,1.149389700113669505e-01,1.083743218115938711e-01,1.135429261076744967e-01,1.224571336189042570e-01,1.316256830092336905e-01,1.296892050846524258e-01,1.220541991422918193e-01,1.251462726710364792e-01,1.475487955738131740e-01,1.8"
DT <- fread(x, fill = TRUE, na.strings = "nan")
DT[, c("V0", "V1") := tstrsplit(V1, " ", fixed = TRUE)]  # split "#1 nan" into label and value
set(DT, which(DT[["V1"]] == "nan"), "V1", NA)
DT[, V1 := as.numeric(V1)]
cnames <- DT$V0                 # keep the labels for use as column names
DT[, V0 := NULL]
DT <- transpose(DT)             # one column per original row
DT <- na.omit(DT)               # drop rows containing any NA (the nan pixels)
setnames(DT, names(DT), cnames)
print(head(DT))
DTmean <- DT[, lapply(.SD, mean)]
print(DTmean)
Results:
> print(head(DT))
#1 #2 #3 #4
1: 0.1225475 0.1136642 0.1017817 0.1111828
2: 0.1044849 0.1130515 0.1131028 0.1192856
3: 0.1237545 0.1184654 0.1114406 0.1157219
4: 0.1715363 0.1445083 0.1227824 0.1051463
5: 0.1922596 0.1663965 0.1329011 0.1042842
6: 0.1809633 0.1663396 0.1441115 0.1263179
> print(DTmean)
#1 #2 #3 #4
1: 0.1477638 0.1407628 0.1330294 0.1976434
This is how I solved it, starting from the answer that @Rage provided:
library(data.table)
library(tidyverse)
proc_txt <- function(f) {
  txt <- fread(file = f, sep = ",")
  txt <- gather(txt)    # reshape to long format: one pixel value per row
  txt <- na.omit(txt)
  mean(txt$value)
}
txt_files <- list.files(path = ".", pattern = "\\.txt$")
df_list <- lapply(txt_files, proc_txt)
final_df <- do.call(rbind, df_list)
The final output is a table with one column holding the mean of all pixel values in each file. For example, given:
n1 <- "nan, nan, 2, 3, 4, nan"
n2 <- "nan, 1, 2, 3, nan"
n3 <- "nan, 3, 4, 5, nan"
The code above produces a table such as:
n1 3
n2 2
n3 4

Extracting numbers from column then create new row

I have a dataset which includes multiple latitude and longitude points within the same column and also has columns with additional variables, like so:
[screenshot: what the data currently looks like]
What I would like to do is extract the numbers in pairs (i.e. 144.81803494458699788 and -37.80978699721590175, then 144.8183146450259926 and -37.80819285880839686) into their own rows. The new rows should also duplicate the rest of the original row from which they came, i.e.:
[screenshot: what I would like the data to look like]
I'm pretty new to R, hence what might seem like a basic question to you all. Update: I've now used
new$latlongs <- str_extract_all(roadchar$X.wkt_geom, "(?>-)*[0-9]+\\.[0-9]+")
and have the numbers/latlongs extracted, including the negative sign :)
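A sketch of how that str_extract_all() result could then be expanded into duplicated rows (the reshaping assumes the numbers always come in longitude/latitude pairs):
library(stringr)
coords <- str_extract_all(roadchar$X.wkt_geom, "-?[0-9]+\\.[0-9]+")
npairs <- lengths(coords) / 2                            # two numbers per point
out <- roadchar[rep(seq_len(nrow(roadchar)), npairs), ]  # duplicate each original row once per point
xy <- matrix(as.numeric(unlist(coords)), ncol = 2, byrow = TRUE)
out$longitude <- xy[, 1]
out$latitude  <- xy[, 2]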
You can use a loop that combines gsub and strsplit:
## The data.frame
df <- data.frame("Polyline" = c("MultiLineString((1.1 - 1.1, 2.2 - 2.2))",
                                "MultiLineString((3.3 - 3.3, 4.4 - 4.4, 5.5 - 5.5))"),
                 t(matrix(c(LETTERS[c(1:3, 24:26)]), 3,
                          dimnames = list(c("Char1", "Char2", "Char3")))),
                 stringsAsFactors = FALSE)
#                                              Polyline Char1 Char2 Char3
# 1             MultiLineString((1.1 - 1.1, 2.2 - 2.2))     A     B     C
# 2  MultiLineString((3.3 - 3.3, 4.4 - 4.4, 5.5 - 5.5))     X     Y     Z
## Function for splitting the line
split.polyline <- function(line, df) {
  ## Removing the text and brackets
  cleaned_line <- gsub("\\)\\)", "",
                       gsub("MultiLineString\\(\\(", "", as.character(df$Polyline[line])))
  ## Splitting the line
  split_line <- strsplit(cleaned_line, split = ", ")[[1]]
  ## Making the line into a data.frame
  df_out <- data.frame("Polyline" = split_line,
                       matrix(rep(df[line, -1], length(split_line)),
                              nrow = length(split_line), byrow = TRUE,
                              dimnames = list(c(), names(df)[-1])))
  return(df_out)
}
## You can use the function like this for the first row for example
df_out <- split.polyline(1, df)
# Polyline Char1 Char2 Char3
# 1 1.1 - 1.1 A B C
# 2 2.2 - 2.2 A B C
## Or loop through all the rows
for (line in 2:nrow(df)) {
  df_out <- rbind(df_out, split.polyline(line, df))
}
# Polyline Char1 Char2 Char3
# 1 1.1 - 1.1 A B C
# 2 2.2 - 2.2 A B C
# 3 3.3 - 3.3 X Y Z
# 4 4.4 - 4.4 X Y Z
# 5 5.5 - 5.5 X Y Z
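If the tidyverse is an option, here is a sketch of the same split with tidyr::separate_rows (using the same toy df from above; the wrapper text is stripped first):
library(tidyr)
df$Polyline <- gsub("^MultiLineString\\(\\(|\\)\\)$", "", df$Polyline)
separate_rows(df, Polyline, sep = ", ")
separate_rows duplicates the remaining columns for each piece, which is exactly the row-duplication the question asks for.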

Separate Million and Billion Data from one column

I am trying the code below to separate the "M" and "B" values into two different columns.
I want output like this:
level 1   level 2
M 3.2     B 3.6
M 4       B 2.8
          B 3.5
Input:
reve=c("M 3.2","B 3.6","B 2.8","B 3.5","M 4")
#class(reve)
data=data.frame(reve)
Here is what I have tried.
index = which(grepl("M ", data$reve))
data$reve = gsub("M ", "", data$reve)
data$reve = gsub("B ", "", data$reve)
data$reve = as.numeric(data$reve)
If you have a data frame, you can do that with separate() from tidyr.
I give you an example of this:
library(dplyr)
library(tidyr)
df <- tibble(coupe = c("M 2.3", "M 4.5", "B 1"))
df %>% separate(coupe, c("MorB", "Quant"), " ")
OUTPUT
# MorB Quant
# <chr> <chr>
#1 M 2.3
#2 M 4.5
#3 B 1
Hope it helps you!
For counting the number of "M" rows:
df %>%
  separate(YourColumn, c("MorB", "Quant"), " ") %>%
  filter(MorB == "M") %>%
  nrow()
Here is a base R approach.
lst <- split(reve, substr(reve, 1, 1))
df1 <- as.data.frame(lapply(lst, `length<-`, max(lengths(lst))))
df1
# B M
#1 B 3.6 M 3.2
#2 B 2.8 M 4
#3 B 3.5 <NA>
split divides the vector in two by the first letter. This gives you a list with entries of unequal length. Use lapply to make the entries the same length, i.e. pad the shorter one with NAs, then call as.data.frame.
If you want to change the names, you can use setNames
setNames(df1, c("level_2", "level_1"))
In case I misunderstood your desired output, try
df1 <- data.frame(do.call(rbind, (strsplit(reve, " "))), stringsAsFactors = FALSE)
df1[] <- lapply(df1, type.convert, as.is = TRUE)
df1
# X1 X2
#1 M 3.2
#2 B 3.6
#3 B 2.8
#4 B 3.5
#5 M 4.0
I think options rooted in regex may also be helpful for these types of problems
reve=c("M 3.2","B 3.6","B 2.8","B 3.5","M 4")
data = data.frame(reve, stringsAsFactors = FALSE) # handle your data as strings, not factors
# regex to extract M vals and B vals (elements with the other prefix come back NA and are dropped)
mvals <- unlist(stringi::stri_extract_all_regex(data$reve, "M+\\s[0-9]\\.[0-9]|M+\\s[0-9]"))
mvals <- mvals[!is.na(mvals)]
bvals <- unlist(stringi::stri_extract_all_regex(data$reve, "B+\\s[0-9]\\.[0-9]|B+\\s[0-9]"))
bvals <- bvals[!is.na(bvals)]
# gluing things together into a single df
len <- max(length(mvals), length(bvals)) # find the longer length
data.frame(M = c(mvals, rep(NA, len - length(mvals))), # pad the shorter vector with NAs
           B = c(bvals, rep(NA, len - length(bvals))))
In case regex is unfamiliar: the first expression searches for "M" followed by a space, then digits 0 through 9, then a period, then digits 0 through 9 again. The vertical pipe is an "or" operator, so the expression also matches "M" followed by a space and digits only; this second half accounts for cases like "M 4". The second expression does the same thing for lines that contain "B" in lieu of "M".
These are quick and dirty regex statements. I'm sure cleaner formulations are possible to get the same results.
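For instance, making the decimal part optional collapses each alternation into a single pattern (a sketch; na.omit drops the non-matching elements):
library(stringi)
mvals <- na.omit(unlist(stri_extract_all_regex(data$reve, "M\\s[0-9]+(\\.[0-9]+)?")))
bvals <- na.omit(unlist(stri_extract_all_regex(data$reve, "B\\s[0-9]+(\\.[0-9]+)?")))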
We can count Millions or Billions as follows:
Input datatset:
reve=c("M 3.2","B 3.6","B 2.8","B 3.5","M 4")
data=data.frame(reve)
Code
library(dplyr)
library(tidyr)
data %>%
  separate(reve, c("Label", "Value"), extra = "merge") %>%
  group_by(Label) %>%
  summarise(n = n())
Output
# A tibble: 2 x 2
Label n
<chr> <int>
1 B 3
2 M 2

Group dataframes by columns and match by n elements

So here is my issue. I have two dataframes. A simplified version of them is below.
df1
ID String
1.1 a
1.1 a
1.1 b
1.1 c
...
1.2 a
1.2 a
1.2 c
1.2 c
...
2.1 a
2.1 n
2.1 o
2.1 o
...
2.2 a
2.2 n
2.2 n
2.2 o
...
3.1 a
3.1 a
3.1 x
3.1 x
...
3.2 a
3.2 x
3.2 a
3.2 x
...
4.1 a
4.1 b
4.1 o
4.1 o
...
4.2 a
4.2 b
4.2 b
4.2 o
Imagine each ID (ex: 1.1) has over 1000 rows. Another thing to note: IDs sharing the same integer part (ex: 1.1 and 1.2) are very similar, but not an exact match to one another.
df2
string2
a
b
a
c
df2 is a test df.
I want to see which of the df1 IDs is the closest match to df2, but with one very important condition: I want to match by n elements at a time, not the whole dataframe against the other.
My pseudo code for this:
df2-elements-to-match <- df2$string2[1:n] # only the first n elements
group df1 by ID
df1-elements-to-match <- df1$String[1:n of every ID] # only the first n elements of each ID
Output a column with the score of how many matches.
Filter df1 to remove ID groups with score < m. # m here could be any number
The filtered df1 becomes the new df1.
n <- n + 1
df2-elements-to-match and df1-elements-to-match both slide down to the next n elements. Overlap is optional (ex: if the first window was 1:2, then 3:4, or even 2:3 and then 3:4).
Reiterate the loop with the updated variables.
If only one ID remains, stop the loop.
The idea here is to get a predicted match without having to match the whole test dataframe.
## minimal dfs
df1 <- data.frame(ID = c(rep(1.1, 5),
                         rep(1.2, 6),
                         rep(1.3, 3)),
                  str = unlist(strsplit("aabaaaabcababc", "")),
                  stringsAsFactors = FALSE)
df2 <- data.frame(str = c("a", "b", "a", "b"), stringsAsFactors = FALSE)
## functions
distance <- function(df, query.df, df.col, query.df.col) {
  deviating <- df[, df.col] != query.df[, query.df.col]
  sum(deviating, na.rm = TRUE) # if there are too few rows, there will be NAs; ignore them
}
distances <- function(dfs, query.df, dfs.col, query.df.col) {
  sapply(dfs, function(df) distance(df, query.df, dfs.col, query.df.col))
}
orderedDistances <- function(dfs, query.df, dfs.col, query.df.col) {
  dists <- distances(dfs, query.df, dfs.col, query.df.col)
  sort(dists)
}
orderByDistance <- function(dfs, query.df, dfs.col, query.df.col, dfs.split.col) {
  dfs.split <- split(dfs, dfs[, dfs.split.col])
  dfs.split.N <- lapply(dfs.split, function(df) df[1:nrow(query.df), ])
  orderedDistances(dfs.split.N, query.df, dfs.col, query.df.col)
}
orderByDistance(df1, df2, "str", "str", "ID")
# 1.3 1.1 1.2
# 1 3 3
# 1.3 is the closest to df2!
Your problem is a kind of distance problem: minimizing distance = finding the most similar sequence.
The distance shown here assumes that, at equivalent positions between df2 and a sub-df of df1, a deviation counts as 1 and equality as 0. The sum gives the dissimilarity score between the compared data frames - sequences of strings.
orderByDistance takes dfs (df1) and a query df (df2), the columns which should be compared, and the column by which dfs should be split (here "ID").
First it splits dfs, then it takes the first N rows of each sub-df (preparation for comparison), and then it applies orderedDistances to each sub-df with exactly N rows (N = number of rows of the query df).
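The iterative narrowing from the pseudocode could be layered on top of these helpers. Here is a sketch (matchIteratively and window are made-up names; it keeps the best-scoring IDs each round rather than using a fixed threshold m, and stops when one ID is left or the query is exhausted):
matchIteratively <- function(dfs, query.df, dfs.col, query.df.col, dfs.split.col, window = 2) {
  candidates <- split(dfs, dfs[, dfs.split.col])
  start <- 1
  while (length(candidates) > 1 && start + window - 1 <= nrow(query.df)) {
    idx <- start:(start + window - 1)
    q <- query.df[idx, , drop = FALSE]
    dists <- sapply(candidates,
                    function(df) distance(df[idx, , drop = FALSE], q, dfs.col, query.df.col))
    candidates <- candidates[dists == min(dists)] # keep only the best-scoring IDs
    start <- start + window                       # slide down to the next window
  }
  names(candidates)
}
matchIteratively(df1, df2, "str", "str", "ID")
# "1.3" -- the first window (a, b) already singles it out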

Need help getting summary statistics for R data frame

This is my data (imagine I have 1050 rows of data shown below)
ID_one ID_two parameterX
111 aaa 23
222 bbb 54
444 ccc 39
My code will then divide the rows into groups of 100 (so there will be 10 groups of 100 rows).
I then want to get the summary statistics per group (this is the part that's not working).
After that I want to place the summary statistics in a data frame to plot them.
For example, put all 10 means for parameterX together in one data frame, put all 10 standard deviations for parameterX in the same data frame, etc.
The following code is not working:
#assume data is available
dataframe_size <- nrow(thedata)
group_size <- 100
number_ofgroups <- round(dataframe_size / group_size)

#split data into groups of 100
split_dataframe_into_groups <- function(x, y)
  0:(x - 1) %% y
list1 <- split(thedata, split_dataframe_into_groups(nrow(thedata), group_size))

#print data in the first group
list1[[1]]$parameterX

#NOT WORKING!!! get summary stat for all 10 groups
# how to loop through all 10 groups?
list1_stat <- do.call(data.frame, list(mean = apply(list1[[1]]$parameterX, 2, mean),
                                       sd = apply(list1[[1]]$parameterX, 2, sd), . . .))
The error message is always:
Error in apply(...): dim(X) must have a positive length
That makes no sense to me, because when I run this code there is clearly data with a positive length:
#print data in the first group
list1[[1]]$parameterX
#how to put all means in a dataframe?
# how to put all standard deviations in the same dataframe
ex: df1 <- mean(2, 2, 3, 4, 7, 2, 4, 9, 8, 9),
         sd(0.1, 3, 0.5, . . .)
dplyr is so good for this kind of thing. If you create a new column that assigns a 'group' ID based on row location, then you can summarize each group very easily. I use an index to assist in assigning group IDs.
install.packages('dplyr')
library(dplyr)
## Create index
df$index <- 1:nrow(df)
## Assign group labels
df$group <- paste("Group", substr(df$index, 1, 1), sep = " ")
df[df$index <= 100, 'group'] <- "Group 0"
df[df$index > 1000, 'group'] <- paste("Group", substr(df$index, 1, 2), sep = " ")
df[df$index > 10000, 'group'] <- paste("Group", substr(df$index, 1, 3), sep = " ")
## Get summaries
df <- group_by(df, group)
summaries <- summarise(df, avg = mean(parameterX),
                       minimum = min(parameterX),
                       maximum = max(parameterX),
                       med = median(parameterX),
                       Mode = mode(parameterX)) # careful: base R's mode() returns the storage mode, not the statistical mode
... and so on.
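As an aside, the same sort of grouping can be done with integer division instead of substr; a one-line sketch (it groups rows 1-100, 101-200, and so on, so the labels differ slightly at the block edges from the substr trick above):
df$group <- paste("Group", (df$index - 1) %/% 100)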
Hope this helps.
I think this might be a good place to use tapply. One path forward might be an extension of the below:
df <- data.frame(id= c(rep("AA",10),rep("BB",10)), x=runif(20))
do.call("rbind", tapply(df$x, df$id, summary))
I think this is what you want :
require(dplyr)
dt <- rbind(iris, iris, iris)
dataframe_size <- nrow(dt)
group_size <- 100
number_ofgroups <- round(dataframe_size / group_size)
df <- dt %>%
  # Creating the "bins" column using mutate
  mutate(bins = cut(seq(1:dataframe_size), breaks = number_ofgroups)) %>%
  # Aggregating the summary statistics by the bins variable
  group_by(bins) %>%
  # Calculating the mean
  summarise(mean.Sepal.Length = mean(Sepal.Length))
head(dt)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
df
bins mean.Sepal.Length
(fctr) (dbl)
1 (0.551,113] 5.597345
2 (113,226] 5.755357
3 (226,338] 5.919643
4 (338,450] 6.100885
