I am attempting to use queryparser to extract table relationships from an SQL query. I can get most of what I need; I'm just having issues unpacking the lists.
library(queryparser)
file <- "select
p.name,
p.age,
p.hometown,
c.state,
c.country,
n.capitol,
n.leader
from person p
inner join city c
on p.hometown = c.name
inner join nation n
on c.county = n.name
where C.country != 'Antarctica' "
query <- parse_query(file, tidyverse = TRUE)
query$from
yields the following list:
> query$from
$p
person
$c
city
$n
nation
attr(,"join_types")
[1] "inner join" "inner join"
attr(,"join_conditions")
attr(,"join_conditions")[[1]]
p.hometown == c.name
attr(,"join_conditions")[[2]]
c.county == n.name
I would like to have a data frame that has each table name and its alias, and a second data frame with the join criteria. What is the easiest way to do this dynamically, so that I don't have to adjust code between scanning different scripts?
Convert the list to character and then use stack. For the info in the attributes, drop the first (names) element and simplify the rest, giving the character matrix shown below.
stack(sapply(query$from, as.character))
## values ind
## 1 person p
## 2 city c
## 3 nation n
simplify2array(attributes(query$from)[-1])
## join_types join_conditions
## [1,] "inner join" "p.hometown == c.name"
## [2,] "inner join" "c.county == n.name"
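If you also want the join information as a data frame (the question asks for a data frame for each piece), and assuming the attributes simplify to the character matrix shown above, you can wrap the same expressions:
tables <- stack(sapply(query$from, as.character))    # table name and alias
joins  <- as.data.frame(simplify2array(attributes(query$from)[-1]),
                        stringsAsFactors = FALSE)    # join type and condition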
Say I have two tables, Table A and Table B, and I want to compare a certain column.
For example,
Table A has the columns:
Name, Surname, Family, Species
Table B has the columns:
IP,Genes,Types,Species,Models
How do I compare the Species column between the two tables to get the matches? That is, I want to extract the names of the species that exist in both tables.
For example, if the first Species column has
a b c d e f g h i
and the second Species column has
k l m n a b y i l
I want this result:
a b i
Can you please tell me how I can do that, and also whether there is any way to do it without using a join?
Thank you very much.
Try any of these options. I have used dummy data:
#Data
TableA <- data.frame(Species = c('a','b','c','d','e','f','g','h','i'),
                     Var = 1, stringsAsFactors = FALSE)
TableB <- data.frame(Species = c('k','l','m','n','a','b','y','i','l'),
                     Var2 = 2, stringsAsFactors = FALSE)
#Option1
TableA$Species[TableA$Species %in% TableB$Species]
#Option 2
intersect(TableA$Species,TableB$Species)
In both cases the output will be:
[1] "a" "b" "i"
I have 2 lists with names of cities. Both lists contain exactly the same city names. I have to create city pairs out of these lists. For example, the 1st list contains names like London, Paris, Moscow, Geneva and Tokyo, and list 2 contains the exact same names. However, the third list should contain pairs like "London-Paris", "London-Tokyo", "London-Geneva" and so on, but not "Tokyo-London" or "Paris-London", since that would be double counting. Any help in either R or Excel would be appreciated.
I have tried using the 'combn' function in R. However, I have around 4500 observations and the 'combn' function did not work for me.
(You don't really need a second list to do that; one is enough.)
cities <- list("London", "Paris", "Kyiv", "Geneva", "Tokyo")
combn(cities, 2, paste, collapse = "-")
# [1] "London-Paris" "London-Kyiv" "London-Geneva" "London-Tokyo" "Paris-Kyiv"
# [6] "Paris-Geneva" "Paris-Tokyo" "Kyiv-Geneva" "Kyiv-Tokyo" "Geneva-Tokyo"
Using expand.grid and then manipulating:
# myList holds the city names from the question
myList <- c("London", "Paris", "Moscow", "Geneva", "Tokyo")
# create all possible combinations
df <- expand.grid(myList, myList)
# ensure only 1 combination for each pair
df <- as.data.frame(unique(t(apply(df, 1, sort))))
# remove same city combinations
df <- subset(df, df$V1 != df$V2)
# create column with pairs
df$combo <- paste0(df$V1, "-", df$V2)
You can also do it in Excel. Create a new module, paste the following code, update the ranges in the first 2 lines of code and execute the VBA macro:
Sub combn_VBA()
'Define variables
Dim CityListSourceRange As Range: Set CityListSourceRange = Sheet1.Range("A1:A5") '<-- Replace A1:A5 with the range address containing the city names
Dim CityCombinationDestinyRange As Range: Set CityCombinationDestinyRange = Sheet1.Range("C1") '<-- Replace C1 with first cell where you want to place the result list
Set CityList = CreateObject("Scripting.Dictionary")
'Copies the source cities into a collection
For Each CellX In CityListSourceRange
i = i + 1
CityList.Add CellX.Value, i
Next CellX
'Creates unique pairs
For Each City1 In CityList
For Each City2 In CityList
If CityList(City1) < CityList(City2) Then
CityCombinationDestinyRange.Offset(j, 0).Value = City1 & "-" & City2
j = j + 1
End If
Next City2
Next City1
End Sub
The destination column will then contain one unique city pair per row.
For Excel - VBA:
We can use a tiny trick:
Since the lists are identical, we can solve the problem with one list only:
Sub MakePairs()
Dim i As Long, N As Long, k As Long, j As Long
N = Cells(Rows.Count, "A").End(xlUp).Row
k = 1
For i = 1 To N - 1
For j = i + 1 To N
Cells(k, 3).Value = Cells(i, 1).Value & "-" & Cells(j, 1).Value
k = k + 1
Next j
Next i
End Sub
NOTE:
this technique avoids unwanted permutations of existing pairs (e.g. Paris-London when London-Paris is already listed)
this technique avoids self-pairs like London-London
We can try to cook up our own function to find the combinations of the city names. It runs noticeably faster than combn() (see the timings below).
The function:
combn2 <- function(x){
  n = length(x)
  paste(
    # first member of each pair: 1 repeated (n-1) times, 2 repeated (n-2) times, ...
    x[rep.int(seq_along(x)[-n], times = rev(seq_along(x))[-1])],
    # second member of each pair: 2:n, then 3:n, then 4:n, ...
    x[unlist(lapply(seq_along(x)[-1], ':', to = n))],
    sep = '-'
  )
}
Check that the results are correct:
cities <- list("London", "Paris", "Kyiv", "Geneva", "Tokyo")
combn2(cities)
# [1] "London-Paris" "London-Kyiv" "London-Geneva" "London-Tokyo" "Paris-Kyiv"
# [6] "Paris-Geneva" "Paris-Tokyo" "Kyiv-Geneva" "Kyiv-Tokyo" "Geneva-Tokyo"
Compare timings for combn() and combn2() on 5K cities.
Data:
cities <- unique(maps::world.cities$name)
length(cities)
# [1] 41074
cities <- cities[1:5000]
Timing for combn():
system.time(
combn(cities, 2, paste, collapse = "-")
)
# user system elapsed
# 116.02 0.01 116.33
Timing for combn2():
system.time(
combn2(cities)
)
# user system elapsed
# 14.04 0.00 14.09
I think most of the time is consumed by paste(), so if you find a way to speed up paste(), I would be very grateful if you let me know how you did it.
I have a dataframe "employee" like this:
Emp_Id,Name,Dept_Id
20203,Sam,1
20301,Rodd,2
30321,Mike,3
40403,Derik,4
Now I want to transform this dataframe so that Dept_Id contains department names instead of the IDs.
I am trying to use recode from dplyr for this. Since my transformation logic comes from a csv, I have to use a variable in place of the transformation logic.
I used read.csv to get my dataframe df, where my logic (1=HR, 2=IT and so on) sits, and then get it into a list:
df:
Source,Target,Transformation
Employee,Emp,"1=HR,2=Sales,3=Finance,4=IT"
To get the transformation logic from df:
myList <- as.character(df[1,3])
Now replacing the data in employee as per the logic
employee$Dept_Id <- recode(employee$Dept_Id,myList)
On this line it is giving me:
Error: Argument 2 must be named, not unnamed
There are multiple ways to do this.
Method 1:
df$Dept_Id <- name[match(df$Dept_Id, names(name))]
Emp_Id Name Dept_Id
1: 20203 Sam HR
2: 20301 Rodd IT
Method 2:
df <- df %>%
  mutate(Dept_Id_2 = case_when(
    Dept_Id == 1 ~ 'HR',
    Dept_Id == 2 ~ 'IT'
  ))
Method 3:
codes <- list("1" = "HR", "2" = "IT")
df %>%
  mutate(d2 = recode(Dept_Id, !!!codes))
Setup
library(data.table)   # for fread()
library(dplyr)        # for mutate(), case_when() and recode()
df <- fread("
Emp_Id Name Dept_Id
20203 Sam 1
20301 Rodd 2
")
name <- c("1" = "HR", "2" = "IT")
Your dataframe df has a different structure, which makes it difficult to apply functions directly. We need to clean it and bring it into a better format so that it is easy to query.
One way to do that is to split the data on , and = to create a new dataframe (lookup) with department ID and Name.
lookup <- data.frame(t(sapply(strsplit(as.character(df[1, 3]), ",")[[1]],
                              function(x) strsplit(x, "=")[[1]])),
                     row.names = NULL)
lookup
# X1 X2
#1 1 HR
#2 2 Sales
#3 3 Finance
#4 4 IT
Once we have lookup, it is easy to match by ID and get the corresponding name.
employee$Dept_Name <- lookup$X2[match(employee$Dept_Id, lookup$X1)]
employee
# Emp_Id Name Dept_Id Dept_Name
#1 20203 Sam 1 HR
#2 20301 Rodd 2 Sales
#3 30321 Mike 3 Finance
#4 40403 Derik 4 IT
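If you would rather stick with recode, as in your original attempt, the same lookup can be turned into a named vector and spliced in with !!!. A minimal sketch, assuming the lookup data frame built above and that dplyr is loaded:
codes <- setNames(as.character(lookup$X2), as.character(lookup$X1))
employee$Dept_Name <- dplyr::recode(as.character(employee$Dept_Id), !!!codes)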
Another way, if you don't want to change your existing database and the department list is not too big.
Assumption: there is no missing data in your "employee" database. If any data is missing, you will need to add one more level of condition.
ifelse is a simple way to apply your logic; I have used it in the code below:
New_DF = ifelse(employee$Dept_Id == 1, "HR",
         ifelse(employee$Dept_Id == 2, "Sales",
         ifelse(employee$Dept_Id == 3, "Finance", "IT")))
New_DF = cbind(employee, New_DF)
I asked a similar question before, but I still need some help / to be pointed in the right direction.
I am trying to locate certain words within a column that contains an SQL statement in each row, and to extract the next word, in RStudio.
Example: let's call this dataframe "SQL":
  | **UserID** | **SQL Statement**
1 | N781       | "SELECT A, B FROM Table.1 p JOIN Table.2 pv ON p.ProdID.1ProdID.1 JOIN Table.3 v ON pv.BusID.1 = v.BusID WHERE SubID = 1 ORDER BY v.Name;"
2 | N283       | "SELECT D, E FROM Table.11 p JOIN Table.2 pv ON p.ProdID.1ProdID.1 JOIN Table.3 v ON pv.BusID.1 = v.BusID WHERE SubID = 1 ORDER BY v.Name;"
So I am trying to pull out the table names: find the words "FROM" and "JOIN" and pull out the table name that follows each.
I have been using some code with help from earlier:
I put the column "SQL Statement" into a list of 2, named "b".
I use the code:
z <- mapply(grepl,"(FROM|JOIN)",b)
which gives me a TRUE or FALSE for each element in each list.
z <- mapply(grep,"(FROM|JOIN)",b)
The above is close: it gives me the position of every match in each of the lists. But I just want to find the word JOIN or FROM and pull out the word that follows. I was trying to get an output something like:
  | **UserID** | **SQL Statement** | Tables
1 | N781       | "SELECT A, B FROM Table.1 p JOIN Table.2 pv ON p.ProdID.1ProdID.1 JOIN Table.3 v ON pv.BusID.1 = v.BusID WHERE SubID = 1 ORDER BY v.Name;" | Table.1, Table.2
2 | N283       | "SELECT D, E FROM Table.11 p JOIN Table.2 pv ON p.ProdID.1ProdID.1 JOIN Table.3 v ON pv.BusID.1 = v.BusID WHERE SubID = 1 ORDER BY v.Name;" | Table.11, Table.31
Here is a working script which uses only base R. The idea is to leverage strsplit to split the query string on the keywords FROM or JOIN. Then the first word of each resulting piece (except for the first piece) should be a table name.
sql <- "SELECT A, B FROM Table.1 p JOIN Table.2 pv ON
p.ProdID.1ProdID.1 JOIN Table.3 v ON pv.BusID.1 =
v.BusID WHERE SubID = 1 ORDER BY v.Name;"
terms <- strsplit(sql, "(FROM|JOIN)\\s+")
out <- unlist(lapply(terms, function(x) gsub("^([^[:space:]]+).*", "\\1", x)))
out <- out[2:length(out)]
out
[1] "Table.1" "Table.2" "Table.3"
To understand better what I did, have a look at the terms list that results from the splitting step.
Edit:
The same logic can also be applied to a vector of query strings, to generate a list of table-name vectors, one per query.
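A minimal sketch of that idea (queries below stands in for the SQL Statement column of your dataframe):
queries <- c(sql, sql)   # e.g. SQL$`SQL Statement`
tables_per_query <- lapply(strsplit(queries, "(FROM|JOIN)\\s+"),
                           function(x) gsub("^([^[:space:]]+).*", "\\1", x)[-1])
tables_per_query
# each element should be: "Table.1" "Table.2" "Table.3"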
Let's say I have the following data frames and want to merge them.
df1 = data.frame(zipcoide=c(90001,90002,90003,66062,90005))
df1
df2 = data.frame(sfc_code=c(900,660,800,400,500,100,300,350,310,450))
df2
SCF Codes are apparently zipcode prefixes and I want to match the sfc_code with the zipcode.
Basically, if I'm given a list of scf codes, I want to select all those zipcodes which have that scf code.
So in this example, I want to end up with
90001
90002
90003
90005
I figure I could use the sqldf package to write a query that selects based on LIKE '%900%', but I was looking for something a little more "elegant."
Thanks!
You want to return all the zipcodes whose first 3 digits appear in your sfc_code list:
df1[ as.numeric(substr( df1$zipcoide , 1 , 3 ) ) %in% df2$sfc_code , ]
# [1] 90001 90002 90003 66062 90005
Probably not the best example because all zip codes are in that sfc_code list!
But if we remove 660 then we get:
df2 = data.frame(sfc_code=c(900,800,400,500,100,300,350,310,450))
df1[ as.numeric(substr( df1$zipcoide , 1 , 3 ) ) %in% df2$sfc_code , ]
# [1] 90001 90002 90003 90005
When your sfc_code is always the first three digits of your zipcode, you can just select the first three digits of the zipcode and match these with the sfc_codes:
df1$sfc_code <- as.numeric(substr(as.character(df1$zipcoide), 1, 3))
match(df1$sfc_code, df2$sfc_code)
Update
If, as @joran commented, you want all zipcodes in df1 for each sfc_code in df2, you could use merge (with or without all = TRUE):
# add id so that we can see which records are matched
df1$id1 <- 1:nrow(df1)
df2$id2 <- 1:nrow(df2)
merge(df2, df1)
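For reference, a minimal end-to-end run of the above on the example data (keeping the zipcoide column name from the question):
df1 <- data.frame(zipcoide = c(90001, 90002, 90003, 66062, 90005))
df2 <- data.frame(sfc_code = c(900, 660, 800, 400, 500, 100, 300, 350, 310, 450))
df1$sfc_code <- as.numeric(substr(as.character(df1$zipcoide), 1, 3))
# ids so that we can see which records are matched
df1$id1 <- 1:nrow(df1)
df2$id2 <- 1:nrow(df2)
merge(df2, df1)              # matched sfc_codes only
merge(df2, df1, all = TRUE)  # also keep sfc_codes with no matching zipcode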