Optimizing a dynamic 7-day cohort query (Firebase BigQuery) - firebase

I wrote the following query below against our mobile app's data. Due to a high user-base, I am getting a 400 request error "Resources exceeded during query execution: The query could not be executed in the allotted memory" when I add the ORDER BY at the bottom.
Question: Is there anything that I can do to optimize the query, but still retain the ORDER BY at the bottom?
I already added in the firebase's demo data-set, but I think their data-set is just too small to have a problem (compared to my data-set which is 5-10 million records big).
-- Cohort query: all events for users whose first touch fell exactly 10 days
-- ago, flagged by whether they fired app_remove inside the observed window.
SELECT
  f.user_pseudo_id,
  f.event_timestamp,
  DATE(TIMESTAMP_MICROS(f.event_timestamp)) AS event_timestamp_date,
  f.event_name,
  f.user_first_touch_timestamp,
  DATE(TIMESTAMP_MICROS(f.user_first_touch_timestamp)) AS user_first_touch_date,
  -- r.has_appRemove is NULL on a LEFT JOIN miss, so the ELSE branch fires
  CASE WHEN r.has_appRemove >= 1 THEN "removed" ELSE "not-removed" END AS status_after_first7days
FROM `firebase-analytics-sample-data.ios_dataset.app_events_*` f
LEFT JOIN (
  -- one row per cohort user that fired app_remove in the same table window
  SELECT user_pseudo_id, 1 AS has_appRemove
  FROM `firebase-analytics-sample-data.ios_dataset.app_events_*`
  WHERE DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY)
    AND DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) < DATE_SUB(CURRENT_DATE(), INTERVAL 9 DAY)
    AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
    AND _TABLE_SUFFIX < FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY))
    AND platform = "ANDROID"
    AND event_name = "app_remove"
  GROUP BY user_pseudo_id
) r ON f.user_pseudo_id = r.user_pseudo_id
WHERE
  DATE(TIMESTAMP_MICROS(f.user_first_touch_timestamp)) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY)
  AND DATE(TIMESTAMP_MICROS(f.user_first_touch_timestamp)) < DATE_SUB(CURRENT_DATE(), INTERVAL 9 DAY)
  AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
  AND _TABLE_SUFFIX < FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY))
  AND platform = "ANDROID"
-- sort keys named instead of positional ORDER BY 1,2
ORDER BY user_pseudo_id, event_timestamp ASC

You can apply a windowing/analytic function instead of joining — like in the example below (not tested):
#standardSQL
-- Same cohort flag computed with an analytic function instead of a self-join:
-- COUNTIF over a per-user window avoids shuffling the table against itself.
SELECT
  user_pseudo_id,
  event_timestamp,
  DATE(TIMESTAMP_MICROS(event_timestamp)) AS event_timestamp_date,
  event_name,
  user_first_touch_timestamp,
  DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) AS user_first_touch_date,
  -- TRUE when the user fired app_remove anywhere inside the filtered window
  COUNTIF(event_name = "app_remove") OVER (PARTITION BY user_pseudo_id) > 0 AS isRemoved
FROM `firebase-analytics-sample-data.ios_dataset.app_events_*`
WHERE
  DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY)
  AND DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) < DATE_SUB(CURRENT_DATE(), INTERVAL 9 DAY)
  AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
  AND _TABLE_SUFFIX < FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY))
  AND platform = "ANDROID"
-- sort keys named instead of positional ORDER BY 1,2
ORDER BY user_pseudo_id, event_timestamp ASC

Related

Firebase BigQuery Export Campaign Performance

I'm having a hard time replicating the metrics found when accessing in the Analytics console. Particularly user and session metrics when split by campaign details (extracted from 'campaign_details' event and attributed on a last interaction basis over 90 days). If I query the data without considering campaign_details my values are as in the console. I'd be interested to know if anyone has worked with this previously and managed to get the data as in the console or if it's even possible to expect parity?
-- Session/campaign report: builds sessions from yesterday's iOS events,
-- attributes each session to the most recent campaign_details touch within
-- a 90-day lookback (last-interaction), then aggregates per landing
-- dimensions. Attempts to reproduce GA console session/user metrics.
--
-- initial_prep: one row per event for yesterday's iOS traffic, with 0/1
-- flags used later to build sessions and per-session metrics.
with initial_prep as (
SELECT
-- 'store' user property; MAX() collapses the repeated field to a scalar
(select max(value.string_value) from unnest(user_properties) where key='store') store,
device.operating_system as operating_system,
event_date,
user_pseudo_id,
event_timestamp,
TIMESTAMP_MICROS(event_timestamp) AS ts,
-- timestamp of the user's previous event; computed but not read downstream
LAG(TIMESTAMP_MICROS(event_timestamp)) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS prev_evt_ts,
IF(event_name = "session_start", 1, 0) AS is_session_start_event,
IF(event_name = "first_open", 1, 0) AS is_first_visit_event,
IF(event_name = "screen_view", 1, 0) AS is_screen_view,
IF(event_name = "purchase",1,0) as is_purchase,
-- ecommerce totals are only populated on purchase events; NULL elsewhere
ecommerce.purchase_revenue as value,
ecommerce.shipping_value as shipping,
ecommerce.total_item_quantity as quantity,
FROM
`[PROJECT DETAILS REDACTED].events_20*`
WHERE
-- both bounds are "yesterday", so exactly one day of tables is scanned;
-- %y (2-digit year) matches the 'events_20*' wildcard remainder (yymmdd)
parse_date('%y%m%d', _table_suffix) between DATE_sub(current_date(), interval 1 day) and DATE_sub(current_date(), interval 1 day)
and
device.operating_system = 'IOS'
-- user_sources: campaign touchpoints over the 90-day attribution window,
-- one row per campaign_details event with its source/campaign params.
), user_sources as (
select
user_pseudo_id,
TIMESTAMP_MICROS(event_timestamp) AS ts,
(select max(value.string_value) from unnest(event_params) where key='source' and event_name in( 'campaign_details')) source,
(select max(value.string_value) from unnest(event_params) where key='campaign' and event_name in( 'campaign_details')) campaign
from
`[PROJECT DETAILS REDACTED].events_20*`
WHERE
parse_date('%y%m%d', _table_suffix) between DATE_sub(current_date(), interval 90 day) and DATE_sub(current_date(), interval 1 day)
and
event_name in ('campaign_details')
)
-- session_id_created: running count of session_start events per user acts
-- as an incrementing per-user session number for every event row.
, session_id_created as (
SELECT
*,
SUM(is_session_start_event) OVER (PARTITION BY user_pseudo_id ORDER BY ts) AS session_id
FROM initial_prep
)
-- session_details: per-event rows enriched with session-level aggregates
-- (start/first-visit flags, session bounds) and candidate campaign touches.
, session_details as (
SELECT
si.user_pseudo_id,
store,
operating_system,
event_date,
event_timestamp,
session_id,
MAX(is_session_start_event) OVER (PARTITION BY si.user_pseudo_id, session_id) AS has_session_start_event,
is_session_start_event,
MAX(is_first_visit_event) OVER (PARTITION BY si.user_pseudo_id, session_id) AS has_first_visit_event,
is_first_visit_event,
is_screen_view,
MAX(event_timestamp) OVER (PARTITION BY si.user_pseudo_id, session_id) AS max_timestamp,
MIN(event_timestamp) OVER (PARTITION BY si.user_pseudo_id, session_id) AS min_timestamp,
is_purchase,
value,
shipping,
quantity,
us.source,
us.campaign,
-- rank=1 selects the most recent campaign touch at or before the event:
-- last-interaction attribution
row_number() over (partition by si.user_pseudo_id, event_timestamp order by us.ts desc) rank,
us.ts time_campaign
from session_id_created si
-- join every event to all earlier campaign touches; the trailing '--' text
-- is a commented-out alternative that would cap the lookback window
left join user_sources us on us.user_pseudo_id = si.user_pseudo_id and si.ts >= us.ts -->= timestamp_sub(us.ts,interval 3600000 MICROSECOND)
)
-- session_fin: keep only the winning (latest) campaign touch per event,
-- then roll events up to one row per session.
, session_fin as (
select user_pseudo_id,
store,
operating_system,
source,
campaign,
event_date,
session_id,
has_session_start_event,
has_first_visit_event,
max_timestamp,
min_timestamp,
sum(is_session_start_event) sessions_alt,
sum(is_screen_view) screen_views,
sum(value) revenue,
sum(is_purchase) transactions,
sum(shipping) shipping,
sum(quantity) item_quantity
from session_details
where rank =1
group by
user_pseudo_id,
store,
event_date,
operating_system,
session_id,
source,
campaign,
has_session_start_event,
has_first_visit_event,
max_timestamp,
min_timestamp
)
-- Final rollup per store/OS/source/campaign/date.
select store
, operating_system
, source
, campaign
, event_date applicabledate
, sum(sessions_alt) sessions
, sum(transactions) transactions
, sum(revenue) local_revenue
, sum(shipping) local_shipping
, sum(item_quantity) item_quantity
-- NOTE(review): timestamps are in microseconds, so dividing by 100000
-- looks off by a factor of 10 (1e6 µs = 1 s) — confirm the intended unit
, avg(max_timestamp/100000 - min_timestamp/100000) avgsessionduration
, count(distinct user_pseudo_id) users
, count(distinct case when has_first_visit_event = 1 then user_pseudo_id end) new_users
, sum(screen_views) screenviews
from session_fin
group by store, event_date, operating_system
, source
, campaign
order by users desc
/**/

How can I fill in the missing dates in Google BigQuery?

I want to write a chart that shows the active users in firebase
I wrote this code
-- Distinct daily active users over the last 7 days; only dates that
-- actually have session_start events produce a row.
SELECT
  event_date,
  COUNT(DISTINCT user_pseudo_id) AS user_count
FROM
  `mark-3314e.analytics_197261162.events_*`
WHERE
  event_name = 'session_start'
  AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY))
  AND _TABLE_SUFFIX <= FORMAT_DATE('%Y%m%d', CURRENT_DATE())
GROUP BY
  event_date
ORDER BY
  event_date
And this is the response
Row event_date user_count
1 20190617 1
2 20190621 3
is there any way to fill the missing dates between 21 and 17 with the previous data? like:
event_date user_count
20190617 1
20190618 1
20190619 1
20190620 1
20190621 3
You may join with a calendar table which contains the full date range of interest:
-- Calendar spine listing every date we must report, so days with no
-- events still appear (with a zero count) after the LEFT JOIN.
WITH dates AS (
  SELECT dt
  FROM UNNEST(['20190617', '20190618', '20190619', '20190620', '20190621']) AS dt
)
SELECT
  d.dt AS event_date,
  -- COUNT(DISTINCT ...) ignores the NULLs produced by join misses,
  -- so empty days report 0
  COUNT(DISTINCT e.user_pseudo_id) AS user_count
FROM dates d
LEFT JOIN `mark-3314e.analytics_197261162.events_*` e
  ON d.dt = e.event_date
  -- filters belong in ON, not WHERE, or the outer join collapses to inner
  AND e._TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY))
                          AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
  AND e.event_name = 'session_start'
GROUP BY
  d.dt
ORDER BY
  d.dt;
For a more general way to generate a date range in BigQuery, see this SO question.
Here is a possible solution using GENERATE_DATE_ARRAY function in BigQuery.
-- Forward-fill missing dates: each reported date carries its user_count
-- until the day before the next reported date.
with data as (
  SELECT
    PARSE_DATE('%Y%m%d', event_date) AS event_date,
    COUNT(DISTINCT user_pseudo_id) AS user_count
  FROM `mark-3314e.analytics_197261162.events_*`
  WHERE _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY))
                          AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
    AND event_name = 'session_start'
  GROUP BY event_date
  -- ORDER BY inside a CTE has no effect and was dropped
)
select dt as event_date, user_count from (
  select
    user_count,
    -- Expand each reported date up to (but excluding) the next reported
    -- date; the last date expands to just itself. COALESCE fixes the
    -- original's bug where a single reported row (nextdate AND previousdate
    -- both NULL) produced a NULL array and silently vanished. The unused
    -- LAG/previousdate branch is gone: both of its arms were identical
    -- whenever nextdate was non-NULL.
    generate_date_array(
      date,
      coalesce(date_sub(nextdate, interval 1 day), date),
      interval 1 day
    ) as dates
  from (
    select
      event_date as date,
      lead(event_date) over(order by event_date) as nextdate,
      user_count
    from data
  )
), unnest(dates) dt
order by event_date

Translating Legacy BigQuery to Standard and getting error "Table name "s3" cannot be resolved: dataset name is missing."

The error is on line 4 and s3.fullVisitorId is underlined. I imagine it is going to throw an error on each of the selected items in that group though.
I tried adding a SELECT statement after DISTINCT and the inline error goes away but when I try to run the query I get the error: "Table name "s3" cannot be resolved: dataset name is missing."
Any help would be appreciated. I am finding translating queries from legacy sql to standard is a bit of a headache.
Here is the full query:
#standardSQL
-- Legacy->Standard SQL fix: the derived table previously exposed several
-- columns with the same name (fullVisitorId twice, firstHit three times),
-- and the outer query referenced them through subquery aliases (s3.*)
-- that go out of scope once the derived table ends — hence "Table name
-- s3 cannot be resolved". Unique aliases inside the derived table,
-- referenced by those names outside, resolve both errors.
SELECT
  CAST(CONCAT(SUBSTR(date,1,4),'-',SUBSTR(date,5,2),'-',SUBSTR(date,7,2)) AS DATE) AS Date,
  COUNT(DISTINCT s3_fullVisitorId) AS users,
  COUNT(s0_firstHit) AS carts,
  COUNT(s1_firstHit) AS order_details,
  COUNT(s2_firstHit) AS order_confirmation
FROM (
  SELECT
    -- first non-null date across the full outer joins
    -- (COALESCE replaces the original nested IFNULL chain)
    COALESCE(s3.date, s0.date, s1.date, s2.date) AS date,
    s3.fullVisitorId AS s3_fullVisitorId,
    s0.fullVisitorId AS s0_fullVisitorId,
    s0.visitId AS s0_visitId,
    s0.firstHit AS s0_firstHit,
    s1.firstHit AS s1_firstHit,
    s2.firstHit AS s2_firstHit
  FROM (
    # user subquery: every session in the window
    SELECT
      date,
      fullVisitorId,
      visitId
    FROM
      `big-query-project-34643.162968675.ga_sessions_*`
    WHERE
      _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
      AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
      AND totals.visits = 1
    GROUP BY
      date,
      fullVisitorId,
      visitId) s3
  FULL OUTER JOIN (
    # first subquery: sessions that reached the cart
    SELECT
      date,
      fullVisitorId,
      visitId,
      MIN(h.hitNumber) AS firstHit
    FROM
      `big-query-project-34643.162968675.ga_sessions_*`, UNNEST(hits) AS h
    WHERE
      _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
      AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
      AND REGEXP_CONTAINS(h.page.pagePath, '/cart')
      AND totals.visits = 1
    GROUP BY
      date,
      fullVisitorId,
      visitId) s0
  ON
    s3.fullVisitorId = s0.fullVisitorId
    AND s3.visitId = s0.visitId
  FULL OUTER JOIN (
    # second subquery: sessions that reached order details
    SELECT
      date,
      fullVisitorId,
      visitId,
      MIN(h.hitNumber) AS firstHit
    FROM
      `big-query-project-34643.162968675.ga_sessions_*`, UNNEST(hits) AS h
    WHERE
      _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
      AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
      AND REGEXP_CONTAINS(h.page.pagePath, '/order-details')
      AND totals.visits = 1
    GROUP BY
      date,
      fullVisitorId,
      visitId) s1
  ON
    s0.fullVisitorId = s1.fullVisitorId
    AND s0.visitId = s1.visitId
  FULL OUTER JOIN (
    # third subquery: sessions that reached order confirmation
    SELECT
      date,
      fullVisitorId,
      visitId,
      MIN(h.hitNumber) AS firstHit
    FROM
      `big-query-project-34643.162968675.ga_sessions_*`, UNNEST(hits) AS h
    WHERE
      _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
      AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
      AND REGEXP_CONTAINS(h.page.pagePath, '/orderconfirmation')
      AND totals.visits = 1
    GROUP BY
      date,
      fullVisitorId,
      visitId) s2
  ON
    s1.fullVisitorId = s2.fullVisitorId
    AND s1.visitId = s2.visitId)
GROUP BY
  date
ORDER BY
  date
You have multiple issues here - notably: in lines 14-19
s3.fullVisitorId,
s0.fullVisitorId,
s0.visitId,
s0.firstHit,
s1.firstHit,
s2.firstHit
I would recommend to at least provide aliases like [for example] below
s3.fullVisitorId s3_fullVisitorId,
s0.fullVisitorId s0_fullVisitorId,
s0.visitId,
s0.firstHit s0_firstHit,
s1.firstHit s1_firstHit,
s2.firstHit s2_firstHit
and then in lines 4-7 - reference the alias
COUNT(DISTINCT s3_fullVisitorId) AS users,
COUNT(s0_firstHit) AS carts,
COUNT(s1_firstHit) AS order_details,
COUNT(s2_firstHit) AS order_confirmation

DAU/MAU Google data studio

I am trying to plot DAU/MAU in google data studio but when I try to create a formula it always says invalid formula.
Additionally, if I try to insert a scorecard with DAU it always fails.
Please advise.
I am afraid you may be coming up against the fact that GA data does not present to GDS as a single data set with all columns in, but rather as different purpose subsets which can't all be joined together. What you could do (as recommended by my son who does this stuff on a daily basis) would be to use the GA plugin in Google Sheets, use formulas there to get the data as you want it, and then point GDS at your Google Sheet data.
To create a scorecard for daily or monthly actives:
Click the pencil next to the metric and make sure it's set to Sum
Set the default date range to Custom and set the interval to Yesterday.
You might also find this article interesting. It shows how to use Google Sheets to combine the DAU / MAU stats.
You can write the below query on the data studio when you link among firebase, bigquery, and data studio;
-- DAU / WAU / MAU relative to the Data Studio report end date.
-- Data Studio injects the report window as @DS_END_DATE (a 'YYYYMMDD'
-- string); the original used #DS_END_DATE, but '#' starts a comment in
-- BigQuery, so '@' is the only valid parameter prefix.
-- FORMAT_DATE('%Y%m%d', d) replaces the original CONCAT/SUBSTR/CAST
-- gymnastics: both turn a DATE into the 'YYYYMMDD' _TABLE_SUFFIX form.
SELECT
  (
    -- distinct users on the day before the report end date
    SELECT COUNT(DISTINCT user_pseudo_id)
    FROM `projectName.events_*` AS A
    WHERE A._TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 DAY))
      AND PARSE_DATE('%Y%m%d', event_date) = DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 DAY)
  ) AS DAU,
  (
    -- distinct users over the trailing week
    SELECT COUNT(DISTINCT user_pseudo_id)
    FROM `projectName.events_*` AS A
    WHERE A._TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 WEEK))
                              AND FORMAT_DATE('%Y%m%d', DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 DAY))
      AND PARSE_DATE('%Y%m%d', event_date) BETWEEN DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 WEEK)
                                               AND DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 DAY)
  ) AS WAU,
  (
    -- distinct users over the trailing month
    SELECT COUNT(DISTINCT user_pseudo_id)
    FROM `projectName.events_*` AS A
    WHERE A._TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 MONTH))
                              AND FORMAT_DATE('%Y%m%d', DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 DAY))
      AND PARSE_DATE('%Y%m%d', event_date) BETWEEN DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 MONTH)
                                               AND DATE_SUB(PARSE_DATE('%Y%m%d', @DS_END_DATE), INTERVAL 1 DAY)
  ) AS MAU

Big Query landing page figures not consistent with Google Analytics interface

I'm using BigQuery to report on Google Analytics data. I'm trying to recreate landing page data using BigQuery.
The following query reports 18% fewer sessions than in the Google Analytics interface:
-- One row per (visitor, visit, landing page) where the session's very
-- first hit is a pageview.
SELECT DISTINCT
  fullVisitorId,
  visitID,
  h.page.pagePath AS LandingPage
FROM
  `project-name.dataset.ga_sessions_*`
CROSS JOIN UNNEST(hits) AS h
WHERE
  _TABLE_SUFFIX BETWEEN '20170331' AND '20170331'
  AND h.hitNumber = 1
  AND h.type = 'PAGE'
ORDER BY fullVisitorId DESC
Where am I going wrong with my approach? Why can't I get to within a small margin of the number in the GA interface's reported figure?
There are multiple reasons:
1. The BigQuery equivalent for the landing-page report:
-- Legacy BigQuery SQL (note TABLE_DATE_RANGE, WITHIN RECORD, STRING()
-- cast) computing landing-page session metrics: the innermost query finds
-- each session's first PAGE hit, the middle query keeps only that hit,
-- and the outer query aggregates per landing page.
-- NOTE(review): this appears garbled as pasted — the trailing comma after
-- AvgTimeOnSite and the condition "hits.page.pagePath''" (presumably
-- meant to be "hits.page.pagePath <> ''") would both be syntax errors;
-- confirm against the original answer before running.
SELECT
LandingPage,
COUNT(sessionId) AS Sessions,
100 * SUM(totals.bounces)/COUNT(sessionId) AS BounceRate,
AVG(totals.pageviews) AS AvgPageviews,
-- timeOnSite is in seconds, so this is avg session duration in seconds
SUM(totals.timeOnSite)/COUNT(sessionId) AS AvgTimeOnSite,
from(
SELECT
-- visitor id + visit id uniquely identifies a session
CONCAT(fullVisitorId,STRING(visitId)) AS sessionID,
totals.bounces,
totals.pageviews,
totals.timeOnSite,
hits.page.pagePath AS landingPage
FROM (
SELECT
fullVisitorId,
visitId,
hits.page.pagePath,
totals.bounces,
totals.pageviews,
totals.timeOnSite,
-- legacy per-record aggregate: lowest PAGE hit number in the session
MIN(hits.hitNumber) WITHIN RECORD AS firstHit,
hits.hitNumber AS hitNumber
FROM (TABLE_DATE_RANGE ([XXXYYYZZZ.ga_sessions_],TIMESTAMP('2016-08-01'), TIMESTAMP ('2016-08-31')))
WHERE
hits.type = 'PAGE'
AND hits.page.pagePath'')
WHERE
-- keep only the first PAGE hit: that is the landing page
hitNumber = firstHit)
GROUP BY
LandingPage
ORDER BY
Sessions DESC,
LandingPage
Next :
Pre-calculated data -- pre-aggregated tables
These are the precalculated data that Google uses to speed up the UI. Google does not specify when this is done but it can be at any point of the time. These are known as pre-aggregated tables
So if you compare the numbers from the GA UI to your BigQuery output, you will always see a discrepancy. Please go ahead and rely on your BigQuery data.
You can achieve the same thing by simply adding the below to your select statement:
,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage
I can get a 1 to 1 match with the GA UI on my end when I run something like below, which is a bit more concise than the original answer:
-- Landing-page metrics that reconcile 1:1 with the GA UI.
-- The redundant outer SELECT DISTINCT is removed (GROUP BY already yields
-- one row per landing page) and positional GROUP BY 1 is named.
SELECT
  a.landingpage
  ,COUNT(DISTINCT a.sessionId) sessions
  ,SUM(a.bounces) bounces
  ,AVG(a.avg_pages) avg_pages
  -- timeonsite is in seconds; /60 reports minutes per session
  ,(SUM(a.tos)/COUNT(DISTINCT a.sessionId))/60 session_duration
FROM
(
  SELECT DISTINCT
    -- visitStartTime (not visitId) disambiguates midnight-split sessions
    CONCAT(CAST(fullVisitorId AS STRING),CAST(visitStartTime AS STRING)) sessionId
    -- landing page = path of the session's first PAGE hit
    ,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage
    ,totals.bounces bounces
    ,totals.timeonsite tos
    ,(SELECT COUNT(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE') avg_pages
  FROM `tablename_*`
  WHERE _TABLE_SUFFIX >= '20180801'
  AND _TABLE_SUFFIX <= '20180808'
  AND totals.visits = 1
) a
GROUP BY a.landingpage
Here is another way — you can get the same numbers:
-- Sessions per landing page, using FIRST_VALUE to pick each session's
-- first PAGE hit.
SELECT
LandingPage,
COUNT(DISTINCT(sessionID)) AS sessions
FROM(
SELECT
-- visitor id + visit id identifies the session
CONCAT(fullVisitorId,CAST(visitId AS STRING)) AS sessionID,
-- path of the earliest PAGE hit in the session, by hit order
FIRST_VALUE(hits.page.pagePath) OVER (PARTITION BY CONCAT(fullVisitorId,CAST(visitId AS STRING)) ORDER BY hits.hitNumber ASC ) AS LandingPage
FROM
`xxxxxxxx1.ga_sessions_*`,
UNNEST(hits) AS hits
WHERE
-- yesterday only (both bounds identical)
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
AND hits.type ='PAGE'
-- GROUP BY is evaluated before the analytic function, so FIRST_VALUE
-- runs over the deduplicated (session, path, hitNumber) rows
GROUP BY fullVisitorId, visitId, sessionID,hits.page.pagePath,hits.hitNumber
)
GROUP BY LandingPage
ORDER BY sessions DESC
There is a hit.isEntrance field in the schema that can be used for this purpose.
The example below would show you yesterday's landing pages:
#standardSQL
-- Yesterday's landing pages via the hit-level isEntrance flag.
-- Positional GROUP BY 1, 2 and ORDER BY 3 are replaced with names.
select
  date,
  hits.page.pagePath as landingPage,
  sum(totals.visits) as visits,
  sum(totals.bounces) as bounces,
  sum(totals.transactions) as transactions
from
  `project.dataset.ga_sessions_*`,
  unnest(hits) as hits
where
  (_table_suffix
    between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
    and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
  and hits.isEntrance = True
  and totals.visits = 1 -- avoid counting midnight-split sessions
group by
  date, landingPage
order by
  visits desc
There is still one source of discrepancy though, which comes from the sessions without a landing page (if you check in GA in the landing pages report, there will sometimes be a (not set) value.
In order to include those as well, you can do:
-- Landing pages including the "(not set)" bucket: every session appears in
-- landing_pages_not_set; sessions that do have an entrance PAGE hit get
-- their real path substituted via the left join + coalesce.
with
-- sessions with an identifiable entrance page and that page's path
landing_pages_set as (
select
concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
hits.page.pagePath as virtualPagePath
from
`project.dataset.ga_sessions_*`,
unnest(hits) as hits
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and totals.visits = 1 #avoid counting midnight-split sessions
and hits.isEntrance = TRUE
group by 1, 2
),
-- every session, provisionally labelled "(not set)", with its totals
landing_pages_not_set as (
select
concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
date,
"(not set)" as virtualPagePath,
count(distinct concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string))) as visits,
sum(totals.bounces) as bounces,
sum(totals.transactions) as transactions
from
`project.dataset.ga_sessions_*`
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and totals.visits = 1 #avoid counting midnight-split sessions
group by 1, 2, 3
),
-- substitute the real entrance path where one exists
-- NOTE(review): if a session ever had more than one distinct entrance
-- pagePath, this left join would fan out and double-count visits/bounces;
-- confirm entrance hits are unique per session
landing_pages as (
select
l.fullVisitId as fullVisitId,
date,
coalesce(r.virtualPagePath, l.virtualPagePath) as virtualPagePath,
visits,
bounces,
transactions
from
landing_pages_not_set l left join landing_pages_set r on l.fullVisitId = r.fullVisitId
)
select virtualPagePath, sum(visits) from landing_pages group by 1 order by 2 desc

Resources