Insert/Update R data.table into PostgreSQL table

I have a PostgreSQL database set up with a table and columns already defined. The primary key for the table is a composite of the (Id, datetime) columns. I need to periodically INSERT data for different Ids from an R data.table into the database. However, if data for a particular (Id, datetime) combination already exists, it should be UPDATED (overwritten). How can I do this using the RPostgres or RPostgreSQL packages?
When I try to insert a data.table where some (Id, datetime) rows already exist, I get an error saying the primary key constraint is violated:
dbWriteTable(con, table, dt, append = TRUE, row.names = FALSE)
Error in connection_copy_data(conn@ptr, sql, value) :
COPY returned error: ERROR: duplicate key value violates unique constraint "interval_data_pkey"
DETAIL: Key (id, dttm_utc)=(a0za000000CSdLoAAL, 2018-10-01 05:15:00+00) already exists.
CONTEXT: COPY interval_data, line 1

You can use my pg package, which has upsert functionality, or just grab the upsert code from there: https://github.com/jangorecki/pg/blob/master/R/pg.R#L249
It is basically what others said in the comments: write the data into a temp table and then insert into the destination table using an ON CONFLICT clause. (A standalone DBI/RPostgres sketch of the same pattern follows the pg code below.)
pgSendUpsert = function(stage_name, name, conflict_by, on_conflict = "DO NOTHING", techstamp = TRUE, conn = getOption("pg.conn"), .log = getOption("pg.log", TRUE)){
  stopifnot(!is.null(conn), is.logical(.log), is.logical(techstamp), is.character(on_conflict), length(on_conflict)==1L)
  cols = pgListFields(stage_name)
  cols = setdiff(cols, c("run_id","r_timestamp")) # remove techstamp to have clean column list, as the fresh one will be used, if any
  # sql
  insert_into = sprintf("INSERT INTO %s.%s (%s)", name[1L], name[2L], paste(if(techstamp) c(cols, c("run_id","r_timestamp")) else cols, collapse=", "))
  select = sprintf("SELECT %s", paste(cols, collapse=", "))
  if(techstamp) select = sprintf("%s, %s::INTEGER run_id, '%s'::TIMESTAMPTZ r_timestamp", select, get_run_id(), format(Sys.time(), "%Y-%m-%d %H:%M:%OS"))
  from = sprintf("FROM %s.%s", stage_name[1L], stage_name[2L])
  if(!missing(conflict_by)) on_conflict = paste(paste0("(", paste(conflict_by, collapse=", "), ")"), on_conflict)
  on_conflict = paste("ON CONFLICT", on_conflict)
  sql = paste0(paste(insert_into, select, from, on_conflict), ";")
  pgSendQuery(sql, conn = conn, .log = .log)
}
#' @rdname pg
pgUpsertTable = function(name, value, conflict_by, on_conflict = "DO NOTHING", stage_name, techstamp = TRUE, conn = getOption("pg.conn"), .log = getOption("pg.log", TRUE)){
  stopifnot(!is.null(conn), is.logical(.log), is.logical(techstamp), is.character(on_conflict), length(on_conflict)==1L)
  name = schema_table(name)
  if(!missing(stage_name)){
    stage_name = schema_table(stage_name)
    drop_stage = FALSE
  } else {
    stage_name = name
    stage_name[2L] = paste("tmp", stage_name[2L], sep="_")
    drop_stage = TRUE
  }
  if(pgExistsTable(stage_name)) pgTruncateTable(name = stage_name, conn = conn, .log = .log)
  pgWriteTable(name = stage_name, value = value, techstamp = techstamp, conn = conn, .log = .log)
  on.exit(if(drop_stage) pgDropTable(stage_name, conn = conn, .log = .log))
  pgSendUpsert(stage_name = stage_name, name = name, conflict_by = conflict_by, on_conflict = on_conflict, techstamp = techstamp, conn = conn, .log = .log)
}
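If you would rather not depend on the pg package, here is a minimal standalone sketch of the same temp-table-plus-ON CONFLICT pattern using DBI/RPostgres directly. The table name interval_data and the key columns id and dttm_utc come from the error message in the question; the value column kwh and the staging table name are assumptions, so adapt them to your schema:
library(DBI)
# Assumes `con` is an open RPostgres connection and `dt` is the data.table to upsert.
# 1. Copy the data into a temporary staging table.
dbWriteTable(con, "interval_data_stage", dt,
             temporary = TRUE, overwrite = TRUE, row.names = FALSE)
# 2. Insert from the staging table into the target table, updating rows that
#    collide with the (id, dttm_utc) primary key.
dbExecute(con, "
  INSERT INTO interval_data (id, dttm_utc, kwh)
  SELECT id, dttm_utc, kwh FROM interval_data_stage
  ON CONFLICT (id, dttm_utc)
  DO UPDATE SET kwh = EXCLUDED.kwh;
")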

Related

Db2 on Cloud: Problem with column in querying from R

I created a connection between R and Db2 on Cloud
library(RODBC)
dsn_driver <- "{IBM DB2 ODBC Driver}"
dsn_database <- "bludb" # e.g. "bludb"
dsn_hostname <- "**"
dsn_port <- "***" # e.g. "32733"
dsn_protocol <- "TCPIP" # i.e. "TCPIP"
dsn_uid <- "**" #
dsn_pwd <- "**" #
dsn_security <- "ssl"
conn_path <- paste("DRIVER=",dsn_driver,
";DATABASE=",dsn_database,
";HOSTNAME=",dsn_hostname,
";PORT=",dsn_port,
";PROTOCOL=",dsn_protocol,
";UID=",dsn_uid,
";PWD=",dsn_pwd,
";SECURITY=",dsn_security,
sep="")
conn <- odbcDriverConnect(conn_path)
conn
Then I created the table
myschema <- "**" #
tables <- c("Annual_Crop")
for (table in tables){
# Drop School table if it already exists
out <- sqlTables(conn, tableType = "TABLE", schema = myschema, tableName =table)
if (nrow(out)>0) {
err <- sqlDrop (conn, paste(myschema,".",table,sep=""), errors=FALSE)
if (err==-1){
cat("An error has occurred.\n")
err.msg <- odbcGetErrMsg(conn)
for (error in err.msg) {
cat(error,"\n")
}
} else {
cat ("Table: ", myschema,".",table," was dropped\n")
}
} else {
cat ("Table: ", myschema,".",table," does not exist\n")
}
}
df1 <- sqlQuery (conn, "CREATE TABLE Annual_Crop(
CD_ID char (6) NOT NULL,
YEAR CHAR (20),
CROP_TYPE varchar (50),
GEO varchar (50),
SEEDED_AREA CHAR (50) ,
HARVESTED_AREA CHAR (50),
PRODUCTION CHAR (50),
AVG_YIELD CHAR (50),
PRIMARY KEY (CD_ID))",
errors = FALSE)
if(df1 == -1){
cat ("An error has occured.\n")
msg <- odbcGetErrMsg(conn)
print (msg)
} else {
cat ("Table was createdd successfuly.\n")
}
I loaded the dataset from a file into the table
anual_cropdf <- read.csv("/resources/labs/MYDATA/data1.csv")
sqlSave(conn, anual_cropdf, 'Annual_Crop', append=TRUE, fast=FALSE, rownames=FALSE, colnames=FALSE, verbose=FALSE)
Then I tried to fetch from the table and it works
FARMDB <- sqlFetch(conn, "Annual_Crop")
tail(FARMDB)
Finally, when I tried to perform a query, it did not work. The result was just the column names (a 0 x 8 result):
info <- paste('select * from Annual_Crop
where Geo = 41600')
query <- sqlQuery(conn,info,believeNRows = FALSE)
query
Why?
Based on your table schema, the data type for Geo is VARCHAR. Have you tried a query like this?
select * from Annual_Crop where Geo = 'Alberta'
or
select * from Annual_Crop where Geo = '41600'
VARCHAR / string values need to be wrapped in single quotes.
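For example, in R, reusing the conn from the question (assuming '41600' is the text actually stored in GEO):
info <- "select * from Annual_Crop where GEO = '41600'"
query <- sqlQuery(conn, info, believeNRows = FALSE)
query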

How to query LOB field in Oracle using R

How can I query a LOB field from Oracle using the ROracle library?
library(ROracle)
drv <- dbDriver("Oracle")
connect.string <- paste(
  "(DESCRIPTION=",
  "(ADDRESS=(PROTOCOL=TCP)(HOST=", host, ")(PORT=", PORT, "))",
  "(CONNECT_DATA=(GLOBAL_NAME=", GLOBAL_NAME, ")(SID=", SID, ")))", sep = "")
con <- ROracle::dbConnect(drv, username = username, password = password, dbname = connect.string)
I am a newbie and tried this:
rs <- dbSendQuery(con, "SELECT UTL_ENCODE.BASE64_ENCODE(CAST(LOB_FIELD AS RAW)) FROM TABLEDATA WHERE OTHER_FIELD = 'something'")
data <- ROracle::fetch(rs)
# Error in .oci.SendQuery(conn, statement, data = data, prefetch = prefetch, :
ORA-00906: missing left parenthesis
rs <- dbSendQuery(con, "SELECT dbms_lob.READ(LOB_FIELD) FROM TABLEDATA WHERE OTHER_FIELD = 'something'")
data <- ROracle::fetch(rs)
#Error in .oci.SendQuery(conn, statement, data = data, prefetch = prefetch, :
ORA-00904: "DBMS_LOB"."READ": invalid identifier
Try it simply with the query
select LOB_FIELD from tab where OTHER_FIELD = 'something'
where LOB_FIELD is the CLOB column:
df <- dbGetQuery(conn, "select LOB_FIELD from tab where OTHER_FIELD = 'something'")
nchar(df$LOB_FIELD)
[1] 68000

How to include / exclude filter statement in R httr query for Localytics

I can successfully query data from Localytics using R, such as the following example:
r <- POST(url = "https://api.localytics.com/v1/query",
          body = list(app_id = <APP_ID>,
                      metrics = c("occurrences", "users"),
                      dimensions = c('a:URI'),
                      conditions = list(day = c("between", "2020-02-11", "2020-03-12"),
                                        event_name = "Content Viewed",
                                        "a:Item URI" = "testing")
                      ),
          encode = "json",
          authenticate(Key, Secret),
          accept("application/json"),
          content_type("application/json"))
stop_for_status(r)
But what I would like to do is create a function so I can do this quickly and not have to copy/paste data.
The issue I am running into is with the line "a:Item URI" = "testing", which filters all searches to the Item URI "testing". Sometimes I don't want to include that filter, so I just remove the line entirely.
When I wrote my function, I tried something like the following:
get_localytics <- function(appID, metrics, dimensions, from = Sys.Date() - 30,
                           to = Sys.Date(), eventName = "Content Viewed",
                           Key, Secret, filterDim = NULL, filterCriteria = NULL){
  r <- httr::POST(url = "https://api.localytics.com/v1/query",
                  body = list(app_id = appID,
                              metrics = metrics,
                              dimensions = dimensions,
                              conditions = list(day = c("between", as.character(from), as.character(to)),
                                                event_name = eventName,
                                                filterDim = filterCriteria)
                              ),
                  encode = "json",
                  authenticate(Key, Secret),
                  accept("application/json"),
                  content_type("application/json"))
  stop_for_status(r)
  result <- paste(rawToChar(r$content), collapse = "")
  document <- fromJSON(result)
  df <- document$results
  return(df)
}
But my attempt at adding filterDim and filterCriteria only produces the error Unprocessable Entity. (Keep in mind, there are lots of variables I can filter by, not just "a:Item URI", so I need to be able to manipulate that as well.)
How can I include a statement, where if I need to filter, I can incorporate that line, but if I don't need to filter, that line isn't included?
conditions is just a list, so you can conditionally add elements to it. Here we just use an if statement to test whether the values are passed and, if so, add them in.
get_localytics <- function(appID, metrics, dimensions, from = Sys.Date() - 30,
                           to = Sys.Date(), eventName = "Content Viewed",
                           Key, Secret, filterDim = NULL, filterCriteria = NULL){
  conditions <- list(day = c("between", as.character(from), as.character(to)),
                     event_name = eventName)
  if (!is.null(filterDim) && !is.null(filterCriteria)) {
    conditions[[filterDim]] <- filterCriteria
  }
  r <- httr::POST(url = "https://api.localytics.com/v1/query",
                  body = list(app_id = appID,
                              metrics = metrics,
                              dimensions = dimensions,
                              conditions = conditions),
                  encode = "json",
                  authenticate(Key, Secret),
                  accept("application/json"),
                  content_type("application/json"))
  stop_for_status(r)
  result <- paste(rawToChar(r$content), collapse = "")
  document <- jsonlite::fromJSON(result)
  df <- document$results
  return(df)
}
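A usage sketch of the fixed function; the app id, key, and secret are placeholders, and "a:Item URI" is the filter dimension from the question:
# Without the optional filter:
df_all <- get_localytics(appID = "<APP_ID>",
                         metrics = c("occurrences", "users"),
                         dimensions = c("a:URI"),
                         Key = "<KEY>", Secret = "<SECRET>")
# With the filter included:
df_testing <- get_localytics(appID = "<APP_ID>",
                             metrics = c("occurrences", "users"),
                             dimensions = c("a:URI"),
                             Key = "<KEY>", Secret = "<SECRET>",
                             filterDim = "a:Item URI",
                             filterCriteria = "testing")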

Truncated updated string with R DBI package

I need to update a wide table on SQL Server from R, and the DBI package seems well suited for that.
The problem is that the R data.frame contains strings of more than 3000 characters, and when I use DBI's dbSendQuery function, all strings are truncated to 256 characters.
Here is a code example:
con <- odbc::dbConnect(drv = odbc::odbc(),
                       dsn = '***',
                       UID = '***',
                       PWD = '***')
df = data.frame(TEST = paste(rep("A", 300), collapse=""),
                TEST_ID = 1068858)
df$TEST = df$TEST %>% as.character
query = paste0('UPDATE MY_TABLE SET "TEST"=? WHERE TEST_ID=?')
update <- DBI::dbSendQuery(con, query)
DBI::dbBind(update, df)
DBI::dbClearResult(update)
odbc::dbDisconnect(con)
Then the following query returns 256 instead of 300:
SELECT LEN(TEST) FROM MY_TABLE WHERE TEST_ID = 1068858
NB : TEST is of type (varchar(max), NULL) and already contains strings of more than 256 chars.
Thanks in advance for any advice
In the end, I chose to drop the sophisticated functions. A solution was to write the table to a .csv file and bulk insert it into the database. Here is an example using the RODBC package:
write.table(x = df,
            file = "/path/DBI_error_test.csv",
            sep = ";",
            row.names = FALSE, col.names = FALSE,
            na = "NULL",
            quote = FALSE)
Query = paste("CREATE TABLE #MY_TABLE_TMP (
TEST varchar(max),
TEST_ID int
);
BULK INSERT #MY_TABLE_TMP
FROM 'C:\\DBI_error_test.csv'
WITH
(
FIELDTERMINATOR = ';',
ROWTERMINATOR = '\n',
BATCHSIZE = 500000,
CHECK_CONSTRAINTS
)
UPDATE R
SET R.TEST = #MY_TABLE_TMP.TEST
FROM MY_TABLE AS R
INNER JOIN #MY_TABLE_TMP ON #MY_TABLE_TMP.TEST_ID = R.TEST_ID;
DROP TABLE #MY_TABLE_TMP;
")
channel <- RODBC::odbcConnect(dsn = .DB_DSN_NAME,
                              uid = .DB_UID,
                              pwd = .DB_PWD)
RODBC::sqlQuery(channel = channel, query = query, believeNRows = FALSE)
RODBC::odbcClose(channel = channel)
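To verify the fix, one can reconnect and re-run the length check from the question; a quick sketch:
channel <- RODBC::odbcConnect(dsn = .DB_DSN_NAME, uid = .DB_UID, pwd = .DB_PWD)
RODBC::sqlQuery(channel, "SELECT LEN(TEST) FROM MY_TABLE WHERE TEST_ID = 1068858")  # should now return 300
RODBC::odbcClose(channel)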

Encountered "ORA-01745: invalid host/bind variable name" when using DbDataAdapter.Update() with ODP.NET

I have a table defined in Oracle 11g with the statement below:
CREATE TABLE "TESTUSER"."TestTableOracleWriter"
("name" VARCHAR2(100 BYTE),
"group" VARCHAR2(100 BYTE),
"number" NUMBER(*,0),
"creation" DATE,
"sliceidentifier" RAW(100),
CONSTRAINT "TESTTABLEORACLEWRITER_PK" PRIMARY KEY ("name"))
And I am using the following code snippet to update the table with content in the dataTable:
private void BatchInsert(DbConnection connection, DbTransaction transaction, DataTable dataTable, string tableName)
{
    DbDataAdapter adapter = ProviderFactories.GetFactory("Oracle.DataAccess.Client").CreateDataAdapter();
    DbCommand insertCommand = connection.CreateCommand();

    DbParameter parameter1 = insertCommand.CreateParameter();
    parameter1.DbType = DbType.String;
    parameter1.ParameterName = "@name";
    parameter1.SourceColumn = "name";
    insertCommand.Parameters.Add(parameter1);

    DbParameter parameter2 = insertCommand.CreateParameter();
    parameter2.DbType = DbType.String;
    parameter2.ParameterName = "@group";
    parameter2.SourceColumn = "group";
    insertCommand.Parameters.Add(parameter2);

    DbParameter parameter3 = insertCommand.CreateParameter();
    parameter3.DbType = DbType.Int32;
    parameter3.ParameterName = "@number";
    parameter3.SourceColumn = "number";
    insertCommand.Parameters.Add(parameter3);

    DbParameter parameter4 = insertCommand.CreateParameter();
    parameter4.DbType = DbType.DateTime;
    parameter4.ParameterName = "@creation";
    parameter4.SourceColumn = "creation";
    insertCommand.Parameters.Add(parameter4);

    insertCommand.CommandType = CommandType.Text;
    insertCommand.CommandText = "INSERT INTO \"TestTableOracleWriter\" (\"name\", \"group\", \"number\", \"creation\") VALUES (:name, :group, :number, :creation)";
    insertCommand.Transaction = transaction;
    insertCommand.UpdatedRowSource = UpdateRowSource.None;

    adapter.InsertCommand = insertCommand;
    adapter.UpdateBatchSize = 0;
    adapter.Update(dataTable);
}
But sometimes the code fails with "ORA-01745: invalid host/bind variable name". I've searched the internet and found some materials saying it has something to do with Oracle reserved words; according to them, "name", "group" and "number" are reserved words. I can change my table column names to make the code work.
But the strangest thing is that the code does not fail all the time: it only fails when dataTable contains only one row; in other scenarios it works as expected. Does anyone have ideas about that?
You cannot use a keyword as a parameter name.
Don't use group and number as parameter names.
