I have a very large df with a column that contains the file directory for each row's data.
Example: D:Mouse_2174/experiment/13/trialsummary.txt.1
I would like to create 2 new columns, one with only the mouse ID (2174) and one with the session number (13). There will be different IDs and session numbers based on the row.
I've used sub as recommended here (match part of names in data.frame to new column), but can only get the subject column down to "D:Mouse_2174". With an additional line I can get it down to "D:Mous2174".
Is there a way to eliminate all chars before _ and after / to obtain the mouse ID?
For the session number, I'm not quite as sure what to do with the multiple /s in the directory name.
percent_correct_list$mouse_id <- sub("/.+", "", percent_correct_list$rn)
#gives me D:Mouse_2174
percent_correct_list$mouse_id <- sub("+._", "", percent_correct_list$mouse_id)
#gives me D:Mous2174
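A minimal sketch of the intended two-step substitution, assuming the second pattern was meant to be ".+_" rather than "+._":
# strip from the first "/" onward, then everything up to and including "_"
percent_correct_list$mouse_id <- sub(".+_", "", sub("/.+", "", percent_correct_list$rn))
# gives "2174"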
Here is sample code for the directories:
df <- data.frame(
rn = c("D:Mouse_2174/iti_intervals/9/trialsummary.txt.1",
"D:Mouse_2181/iti_intervals/33/trialsummary.txt.1",
"D:Mouse_2183/iti_intervals/107/trialsummary.txt.2",
"D:Mouse_2185/iti_intervals/87/trialsummary.txt.1")
)
What I want:
rn      id      session
D:..    2174    9
D:..    2181    33
D:..    2183    107
D:..    2185    87
Maybe there's some way to do this earlier in the process too (like when I import all the data into a df using lapply), but this is good as well.
This for sure isn't an elegant solution, and it only works if your ID and session are always numbers...
library(dplyr)
library(tidyr)

df <- data.frame(
  rn = c("D:Mouse_2174/iti_intervals/9/trialsummary.txt.1",
         "D:Mouse_2181/iti_intervals/33/trialsummary.txt.1",
         "D:Mouse_2183/iti_intervals/107/trialsummary.txt.2",
         "D:Mouse_2185/iti_intervals/87/trialsummary.txt.1")) %>%
  # Extract all numeric values from the string
  mutate(allnums = regmatches(rn, gregexpr("[[:digit:]]+", rn))) %>%
  # Separate them
  separate(allnums, into = c("id", "session", "idk"), sep = "\\,") %>%
  # Extract them individually
  mutate(id = as.numeric(regmatches(id, gregexpr("[[:digit:]]+", id))),
         session = as.numeric(regmatches(session, gregexpr("[[:digit:]]+", session)))) %>%
  select(-idk)
Output:
1 D:Mouse_2174/iti_intervals/9/trialsummary.txt.1 2174 9
2 D:Mouse_2181/iti_intervals/33/trialsummary.txt.1 2181 33
3 D:Mouse_2183/iti_intervals/107/trialsummary.txt.2 2183 107
4 D:Mouse_2185/iti_intervals/87/trialsummary.txt.1 2185 87
Here's a somewhat long-winded solution, using tidyr::separate. Perhaps there is something more concise/elegant.
It does assume that all values of rn take the same format.
library(dplyr)
library(tidyr)
new_df <- df %>%
  # separate on / into 4 new columns
  separate(rn, into = paste0("item", 1:4), sep = "/", remove = FALSE) %>%
  # remove unwanted columns
  select(-item2, -item4) %>%
  # separate again on _ into 2 new columns
  separate(item1, sep = "_", into = c("prefix", "id")) %>%
  # retain and rename desired columns
  select(rn, id, session = item3)
Result:
rn id session
1 D:Mouse_2174/iti_intervals/9/trialsummary.txt.1 2174 9
2 D:Mouse_2181/iti_intervals/33/trialsummary.txt.1 2181 33
3 D:Mouse_2183/iti_intervals/107/trialsummary.txt.2 2183 107
4 D:Mouse_2185/iti_intervals/87/trialsummary.txt.1 2185 87
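If something more concise is wanted, tidyr::extract can pull both fields with a single regex. A minimal sketch, assuming every rn matches the pattern _<id>/<folder>/<session>/ (new_df2 is just an illustrative name):
library(tidyr)

# capture the digits after "_" as id and the digits between the
# second and third "/" as session; convert = TRUE makes them integer
new_df2 <- extract(df, rn, into = c("id", "session"),
                   regex = "_(\\d+)/[^/]+/(\\d+)/",
                   remove = FALSE, convert = TRUE)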
I have a semicolon-separated file in which one of the character variables contains semicolons inside it. The readr::read_csv2 function splits the contents of those variables into extra columns, messing up the formatting of the file.
For example, when using read_csv2 to open the file below, Bill's age column will show jogging, not 41.
File:
name;hobbies;age
Jon;cooking;38
Bill;karate;jogging;41
Maria;fishing;32
Considering that the original file doesn't contain quotes around the character type variables, how can I import the file so that karate and jogging belong in the hobbies column?
read.csv()
You can use the read.csv() function, but there will be some warning messages (or use suppressWarnings() to wrap around the read.csv() call). If you wish to avoid warning messages, use the scan() method in the next section.
library(dplyr)
read.csv("./path/to/your/file.csv", sep = ";",
col.names = c("name", "hobbies", "age", "X4")) %>%
mutate(hobbies = ifelse(is.na(X4), hobbies, paste0(hobbies, ";" ,age)),
age = ifelse(is.na(X4), age, X4)) %>%
select(-X4)
scan()
You can scan() the CSV file into a character vector first, then split the strings on ; and turn the result into a dataframe. After that, do some mutate() to repair your target column and remove the unnecessary columns. Finally, use the first row as the column names.
library(tidyverse)
library(janitor)
semicolon_file <- scan(file = "./path/to/your/file.csv", character())
semicolon_df <- data.frame(str_split(semicolon_file, ";", simplify = T))
semicolon_df %>%
mutate(X4 = na_if(X4, ""),
X2 = ifelse(is.na(X4), X2, paste0(X2, ";" ,X3)),
X3 = ifelse(is.na(X4), X3, X4)) %>%
select(-X4) %>%
janitor::row_to_names(row_number = 1)
Output
name hobbies age
2 Jon cooking 38
3 Bill karate;jogging 41
4 Maria fishing 32
Assuming that the columns name and age have a single entry per observation and that hobbies can have multiple entries, the following approach works.
Read in the file line by line instead of treating it as a table:
tmp <- readLines(con <- file("table.csv"))
close(con)
Find the positions of the separators in every row. The entry before the first separator is the name; the entry after the last is the age:
separator_pos <- gregexpr(";", tmp)
name <- character(length(tmp) - 1)
age <- integer(length(tmp) - 1)
hobbies <- vector("list", length = length(tmp) - 1)
Fill the three objects using a for loop:
# the first line holds the column names
for (line in 2:length(tmp)) {
  # from the beginning of the row to the first ";"
  name[line - 1] <- strtrim(tmp[line], separator_pos[[line]][1] - 1)
  # between the first ";" and the last ";";
  # every ";" delimits a different element of the list
  hobbies[line - 1] <- strsplit(substr(tmp[line], separator_pos[[line]][1] + 1,
                                       separator_pos[[line]][length(separator_pos[[line]])] - 1), ";")
  # after the last ";" there must be an integer
  age[line - 1] <- as.integer(substr(tmp[line], separator_pos[[line]][length(separator_pos[[line]])] + 1,
                                     nchar(tmp[line])))
}
Create a separate matrix to hold the hobbies and fill it rowwise:
hobbies_matrix <- matrix(NA_character_, nrow = length(hobbies), ncol = max(lengths(hobbies)))
for(line in 1:length(hobbies))
hobbies_matrix[line,1:length(hobbies[[line]])] <- hobbies[[line]]
Add all variables to a data.frame:
df <- data.frame(name = name, hobbies = hobbies_matrix, age = age)
> df
name hobbies.1 hobbies.2 age
1 Jon cooking <NA> 38
2 Bill karate jogging 41
3 Maria fishing <NA> 32
You could also replace just the first and last semicolons with commas (leaving any interior ones alone) and let read.csv handle the rest:
read.csv(text=gsub('(^[^;]+);|;([^;]+$)', '\\1,\\2', readLines('file.csv')))
name hobbies age
1 Jon cooking 38
2 Bill karate;jogging 41
3 Maria fishing 32
Ideally you'd ask whoever generated the file to do it properly next time :) but of course this is not always possible.
The easiest way is probably to read the lines from the file into a character vector, then clean up and build the data frame by string matching.
library(readr)
library(dplyr)
library(stringr)
# skip header, add it later
dataset <- read_lines("your_file.csv", skip = 1)
dataset_df <- data.frame(name = str_match(dataset, "^(.*?);")[, 2],
hobbies = str_match(dataset, ";(.*?);\\d")[, 2],
age = as.numeric(str_match(dataset, ";(\\d+)$")[, 2]))
Result:
name hobbies age
1 Jon cooking 38
2 Bill karate;jogging 41
3 Maria fishing 32
Using the file created in the Note at the end
1) read.pattern can read this by specifying the pattern as a regular expression with the portions within parentheses representing the fields.
library(gsubfn)
read.pattern("hobbies.csv", pattern = '^(.*?);(.*);(.*)$', header = TRUE)
## name hobbies age
## 1 Jon cooking 38
## 2 Bill karate;jogging 41
## 3 Maria fishing 32
2) Base R. We can read in the lines, put quotes around the middle field, and then read it in normally.
L <- "hobbies.csv" |>
readLines() |>
sub(pattern = ';(.*);', replacement = ';"\\1";')
read.csv2(text = L)
## name hobbies age
## 1 Jon cooking 38
## 2 Bill karate;jogging 41
## 3 Maria fishing 32
Note
Lines <- "name;hobbies;age
Jon;cooking;38
Bill;karate;jogging;41
Maria;fishing;32
"
cat(Lines, file = "hobbies.csv")
Looking for advice on refining my code and also trimming to a date range.
The spreadsheet itself is pulled from another system, so the structure of the Excel file cannot be changed. When you pull the data it basically starts at E2, with the first date column in F2 and the first item in E3. The data will continue to populate to the right for as long as it goes on. I have replicated the structure below (screenshot omitted).
And I want it to look like the transposed layout (screenshot omitted).
I have come up with the below, which works, but I was looking for advice on refining it down to fewer individual steps.
In the below code:
#1 = extracting the data
#2 = pulling the dates out
#3 = formatting from an Excel serial number to an actual date
#4 = grabbing the item names
#5 = transposing the data and skipping some parts
#6 = adding the dates to the row names
#1
library(readxl)
gtb <- data.frame(read_excel("C:/example.xlsx", sheet = "Sheet1"))
#2
dfdate <- gtb[1, -c(1,2,3,4,5)]
#3
dfdate <- format(as.Date(as.numeric(dfdate), origin = "1899-12-30"), "%d/%m/%Y")
#4
rownames(gtb) <- gtb[,1]
#5
gtb <- as.data.frame(t(gtb[, -c(1,2,3,4,5)]))
#6
rownames(gtb) <- dfdate
After the row names have been added the structure is such that I am happy to start creating the visuals where needed.
Thanks for your advice,
David
Here is one suggestion. I don't really have easy access to your data, but I am including code that removes the unwanted columns by name rather than by index, which can be nicer.
library(dplyr)
library(tibble)
library(lubridate)

df <- read.table(text =
"Item_Code 01/01/2018 01/02/2018 01/03/2018 01/04/2018
Item 99 51 60 69
Item2 42 47 88 2
Item3 36 81 42 48
", header = TRUE, check.names = FALSE) %>%
  rename(`Item Code` = Item_Code)

x <- df %>%
  select(-matches("Code \\d|Internal Code")) %>%
  column_to_rownames("Item Code") %>%
  t %>%
  as.data.frame %>%
  rownames_to_column("Item Code") %>%
  mutate(`Item Code` = dmy(`Item Code`))
x
Output:
> x
Item Code Item Item2 Item3
1 2018-01-01 99 42 36
2 2018-02-01 51 47 81
3 2018-03-01 60 88 42
4 2018-04-01 69 2 48
I went back and forth a bit with this solution, but it is nice to also showcase how to remove columns by a regex on their names, since you are removing several similarly named columns.
The t trick, which you also use, works because there is really only one other column that would cause problems with this, as others have commented, and it can be temporarily stowed away as rownames. If that weren't the case, you would be looking at a more complex solution involving pivot_wider and pivot_longer, or splitting the data.frame and transposing only one of the halves.
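For completeness, a rough sketch of that pivot-based route, assuming the same df built above (with dplyr and lubridate still loaded); note it names the date column date rather than Item Code:
library(tidyr)

x2 <- df %>%
  # one row per (item, date) pair
  pivot_longer(-`Item Code`, names_to = "date", values_to = "value") %>%
  mutate(date = dmy(date)) %>%
  # one column per item, one row per date
  pivot_wider(names_from = `Item Code`, values_from = value)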
I'm a beginner dealing with R and working with strings.
I've been trying to remove periods from data, but unfortunately I can't find a solution.
This is the data I'm working on in a dataframe df:
df <- read.table(text = " n mesAno receita
97 1/2009 3.812.819.062,06
98 2/2009 4.039.362.599,36
99 3/2009 3.652.885.587,18
100 4/2009 3.460.247.960,02
101 5/2009 3.465.677.403,12
102 6/2009 3.131.903.622,55
103 7/2009 3.204.983.361,46
104 8/2009 3.811.786.009,24
105 9/2009 3.180.864.095,05
106 10/2009 3.352.535.553,88
107 11/2009 5.214.148.756,95
108 12/2009 4.491.795.201,50
109 1/2010 4.333.557.619,30
110 2/2010 4.808.488.277,86
111 3/2010 4.039.347.179,81
112 4/2010 3.867.676.530,69
113 5/2010 6.356.164.873,94
114 6/2010 3.961.793.391,19
115 7/2010 3797656130.81
116 8/2010 4709949715.37
117 9/2010 4047436592.12
118 10/2010 3923484635.28
119 11/2010 4821729985.03
120 12/2010 5024757038.22",
header = TRUE,
stringsAsFactors = TRUE)
My objective is to convert the receita column to numeric, as it is currently stored as a factor. But applying conversion functions like as.numeric(as.character(x)) does not work for rows 97:114 (it coerces them to NAs).
I suppose this is because of the periods separating billions/millions/thousands in this column.
The conversion functions only work if the value already looks like 3812819062.06, as in rows 115:120.
I tried mutating the dataset, adding another column, and modelling. I don't really know if what I'm doing is right, but I also tried extracting the anomalous numbers to a variable and applying sub/gsub on them, without success.
Is there some straightforward way of doing this, that is, instructing it to remove the grouping periods and then replace the comma with a '.'?
I'm very confident that the function I need is gsub, but I'm having a hard time finding the correct usage. Any help will be appreciated.
Edit: my approach using dplyr::mutate(). Ugly, but it works.
library(dplyr)
library(stringr)

df <- df %>%
  mutate(receita_temp = receita) %>%
  mutate(dot_count = str_count(receita, '\\.')) %>%
  mutate(receita_temp = ifelse(dot_count == 3,
                               gsub('\\.', '', as.factor(receita_temp)),
                               gsub('\\,', '.', as.factor(receita_temp)))) %>%
  mutate(receita_temp = ifelse(dot_count == 3,
                               gsub('\\,', '.', as.factor(receita_temp)),
                               receita_temp)) %>%
  select(-c(dot_count, receita)) %>%
  rename(receita = receita_temp)
I'm using regex and some stringr functions to remove all the periods except those followed by two or more digits and the end of the string. That way, periods denoting separation, as in 3.811.786.009,24, are removed, but periods denoting the start of a decimal, as in 4821729985.03, are not. Using str_remove_all rather than str_remove means I don't have to worry about applying the removal repeatedly or about how well it scales. Then replace the remaining commas with periods and make the column numeric.
library(tidyverse)
df2 <- df %>%
mutate(receita = str_remove_all(receita, "\\.(?!\\d{2,}$)") %>%
str_replace_all(",", ".") %>%
as.numeric())
print(head(df2), digits = 12)
#> n mesAno receita
#> 1 97 1/2009 3812819062.06
#> 2 98 2/2009 4039362599.36
#> 3 99 3/2009 3652885587.18
#> 4 100 4/2009 3460247960.02
#> 5 101 5/2009 3465677403.12
#> 6 102 6/2009 3131903622.55
Created on 2018-09-04 by the reprex package (v0.2.0).
You can use the following. First, create a function that will be used for the replacement:
repl = function(x)setNames(c("","."),c(".",","))[x]
This function takes either "." or "," and returns "" or "." respectively.
Now use this function to do the replacement:
stringr::str_replace_all(as.character(df[,3]), "[.](?!\\d+$)|,", repl)
[1] "3812819062.06" "4039362599.36" "3652885587.18" "3460247960.02" "3465677403.12" "3131903622.55"
[7] "3204983361.46" "3811786009.24" "3180864095.05" "3352535553.88" "5214148756.95" "4491795201.50"
[13] "4333557619.30" "4808488277.86" "4039347179.81" "3867676530.69" "6356164873.94" "3961793391.19"
[19] "3797656130.81" "4709949715.37" "4047436592.12" "3923484635.28" "4821729985.03" "5024757038.22"
Of course you can do the rest, i.e. calling as.numeric(), etc.
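As a usage sketch, the replacement and conversion can be chained in one step (this assumes a stringr version whose str_replace_all accepts a function replacement, as used above; receita_num is just an illustrative column name):
df$receita_num <- as.numeric(
  stringr::str_replace_all(as.character(df[,3]), "[.](?!\\d+$)|,", repl))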
To do this in base R:
sub(',','.',gsub('[.](?!\\d+$)','',as.character(df[,3]),perl=T))
Or, if you know the exact number of . and , in your data, you could do:
a = as.character(df[,3])
regmatches(a,gregexpr('[.](?!\\d+$)|,',df[,3],perl = T)) = list(c("","","","."))
a
df$num <- as.numeric(sapply(as.character(df$receita), function(x)
  gsub("\\,", "\\.", ifelse(grepl("\\,", x), gsub("\\.", "", x), x))))
should do the trick.
First, the function searches for rows containing ",", removes the "." in those rows, and finally converts every remaining "," into ".", so the column can be converted to numeric without problems.
Use print(df$num, digits = 12) to see your data with 2 decimals.
I am handling customer data that has customer first and last names. I want to clean the names of any random keystrokes: test accounts are jumbled into the data set and have junk names. For example, in the data below I want to remove customers 2, 5, 9, 10, 12, etc. I would appreciate your help.
Customer Id FirstName LastName
1 MARY MEYER
2 GFRTYUIO UHBVYY
3 CHARLES BEAL
4 MARNI MONTANEZ
5 GDTDTTD DTTHDTHTHTHD
6 TIFFANY BAYLESS
7 CATHRYN JONES
8 TINA CUNNINGHAM
9 FGCYFCGCGFC FGCGFCHGHG
10 ADDHJSDLG DHGAHG
11 WALTER FINN
12 GFCTFCGCFGC CG GFCGFCGFCGF
13 ASDASDASD AASDASDASD
14 TYKTYKYTKTY YTKTYKTYK
15 HFHFHF HAVE
16 REBECCA CROSSWHITE
17 GHSGHG HGASGH
18 JESSICA TREMBLEY
19 GFRTYUIO UHBVYY
20 HUBHGBUHBUH YTVYVFYVYFFV
21 HEATHER WYRICK
22 JASON SPLICHAL
23 RUSTY OWENS
24 DUSTIN WILLIAMS
25 GFCGFCFGCGFC GRCGFXFGDGF
26 QWQWQW QWQWWW
27 LIWNDVLIHWDV LIAENVLIHEAV
28 DARLENE SHORTRIDGE
29 BETH HDHDHDH
30 ROBERT SHIELDS
31 GHERDHBXFH DFHFDHDFH
32 ACE TESSSSSRT
33 ALLISON AWTREY
34 UYGUGVHGVGHVG HGHGVUYYU
35 HCJHV FHJSEFHSIEHF
The problem is that you'd need a solid definition of an improbable name, and that is not really related to R. Anyway, I suggest you go by the first names and remove all those that are not plausible. As a source of plausible first names, or positive list, you could use e.g. the SSA Baby Name Database. This should work reasonably well for filtering English first names. If you have more location-specific needs, look online for other baby-name databases and try to scrape them as a positive list.
Once you have them in a vector named positiveNames, keep only the rows whose first name appears in it:
data_new <- data_original[data_original$FirstName %in% positiveNames,]
My approach is the following:
1) Merge FirstName and LastName into a single string, strname. Then count the letters in each strname.
2) At this point, we find that real names, like "MARNIMONTANEZ", are composed of a wide variety of letters (two 'M's, two 'A's, one 'R', one 'I', three 'N's, one 'O', one 'T', ...), while fake names, like "GFCTFCGCFGCCGGFCGFCGFCGF", are composed of a handful of letters ('G', 'F', 'C') repeated many times.
3) The pattern that distinguishes real names from fake names becomes clear:
- real names show a greater variety of letters; we can measure this with a variable check_real, computed as the number of unique letters divided by the total string length
- fake names show a few letters repeated several times; we can measure this with a variable check_fake, computed as the average frequency of each letter
4) Finally, define a threshold on each variable that marks an anomaly. When a threshold is crossed, set a flag_real or flag_fake:
- if flag_real == 1 & flag_fake == 0, the name is real
- if flag_real == 0 & flag_fake == 1, the name is fake
In the rare cases when the two flags agree (i.e. flag_real == 1 & flag_fake == 1), you have to inspect the record manually and tune the thresholds.
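A minimal sketch of what this could look like, assuming a data frame df with FirstName and LastName character columns (like the test data in the next answer); the 0.5 and 2 cutoffs are illustrative assumptions to tune on real data:
library(dplyr)

checked <- df %>%
  mutate(strname = gsub(" ", "", paste0(FirstName, LastName)),
         chars = strsplit(strname, ""),
         # share of distinct letters in the full name
         check_real = sapply(chars, function(x) length(unique(x)) / length(x)),
         # average frequency of each distinct letter
         check_fake = sapply(chars, function(x) mean(table(x))),
         flag_real = as.integer(check_real >= 0.5),  # assumed threshold
         flag_fake = as.integer(check_fake >= 2)) %>% # assumed threshold
  select(-chars)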
You can calculate the variability strength of the full name (FirstName and LastName combined) as the number of unique letters divided by the total number of characters in the full name. Then just remove the names with low variability strength: those are the names with a high frequency of the same repeated keystrokes.
I did this using the charToRaw function, because it is very fast, and the dplyr library, as below:
# Building Test Data
df <- data.frame(CustomerId = c(1, 2, 3, 4, 5, 6, 7),
FirstName = c("MARY", "FGCYFCGCGFC", "GFCTFCGCFGC", "ASDASDASD", "GDTDTTD", "WALTER", "GFCTFCGCFGC"),
LastName = c("MEYER", "FGCGFCHGHG", "GFCGFCGFCGF", "AASDASDASD", "DTTHDTHTHTHD", "FINN", "CG GFCGFCGFCGF"), stringsAsFactors = FALSE)
#test data: df
# CustomerId FirstName LastName
#1 1 MARY MEYER
#2 2 FGCYFCGCGFC FGCGFCHGHG
#3 3 GFCTFCGCFGC GFCGFCGFCGF
#4 4 ASDASDASD AASDASDASD
#5 5 GDTDTTD DTTHDTHTHTHD
#6 6 WALTER FINN
#7 7 GFCTFCGCFGC CG GFCGFCGFCGF
library(dplyr)
df %>%
## Combining FirstName and LastName
mutate(FullName = paste(FirstName, gsub(" ", "", LastName, fixed = TRUE))) %>%
group_by(FullName) %>%
## Calculating variability strength for each full name
mutate(Variability = length(unique(as.integer(charToRaw(FullName))))/nchar(FullName))%>%
## Filtering full name, I set above or equal to 0.4 (You can change this)
## Meaning we are keeping full name that has variability strength greater than or equal to 0.40
filter(Variability >= 0.40)
# A tibble: 2 x 5
# Groups: FullName [2]
# CustomerId FirstName LastName FullName Variability
# <dbl> <chr> <chr> <chr> <dbl>
#1 1 MARY MEYER MARY MEYER 0.6000000
#2 6 WALTER FINN WALTER FINN 0.9090909
I tried to combine the suggestions in the code below. Thanks everyone for the help.
# load required libraries
library(hunspell)
library(dplyr)
# read data in dataframe df
df<-data.frame(CustomerId = c(1, 2, 3, 4, 5, 6, 7,8),
FirstName = c("MARY"," ALBERT SAM", "FGCYFCGCGFC", "GFCTFCGCFGC", "ASDASDASD", "GDTDTTD", "WALTER", "GFCTFCGCFGC"),
LastName = c("MEYER","TEST", "FGCGFCHGHG", "GFCGFCGFCGF", "AASDASDASD", "DTTHDTHTHTHD", "FINN", "CG GFCGFCGFCGF"), stringsAsFactors = FALSE)
# Keep unique names
df<-distinct(df,FirstName, LastName, .keep_all = TRUE)
# Spell check using hunspell
df$flag <- hunspell_check(df$FirstName) | hunspell_check(as.character(df$LastName))
# remove middle names
df$FirstNameOnly<-gsub(" .*","",df$FirstName)
# SSA name data from https://www.ssa.gov/oact/babynames/names.zip
# unzip files in a folder named "names"
files <- list.files("/names", pattern = "\\.txt$", full.names = TRUE)
# the yob*.txt files have no header row
ssa_names <- do.call(rbind, lapply(files, function(x)
  read.csv(x, header = FALSE,
           col.names = c("Name", "Gender", "Frequency"),
           stringsAsFactors = FALSE)))
# Change SSA names to uppercase
ssa_names$Name <- toupper(ssa_names$Name)
# Flag for SSA names
df$flag_SSA<-ifelse(df$FirstNameOnly %in% ssa_names$Name,TRUE,FALSE)
rm(ssa_names)
# remove spaces and concatenate first name and last name
df$strname<-gsub(" ","",paste(df$FirstName,df$LastName, sep = ""))
# Name string length
df$len<-nchar(df$strname)
# Unique string length
for(n in 1:nrow(df))
{
df$ulen[n]<-length(unique(strsplit(df$strname[n], "")[[1]]))
}
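# (a vectorized sketch of the same computation, as an alternative to the loop:)
# df$ulen <- sapply(strsplit(df$strname, ""), function(x) length(unique(x)))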
# Ratio variable for unique string length over total string length
df$ratio<-ifelse(df$len==0,0,df$ulen/df$len)
# Histogram to determine cutoff ratio
hist(df$ratio)
test<-df[df$ratio<.4 & df$flag_SSA==FALSE & df$flag==FALSE,]