I want to write a chart that shows the active users in firebase
I wrote this code
SELECT event_date, COUNT(DISTINCT user_pseudo_id) AS user_count
FROM `mark-3314e.analytics_197261162.events_*`
WHERE _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)) AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
AND event_name = 'session_start'
GROUP BY event_date
ORDER BY event_date ASC
And this is the response
Row event_date user_count
1 20190617 1
2 20190621 3
is there any way to fill the missing dates between 21 and 17 with the previous data? like:
event_date user_count
20190617 1
20190618 1
20190619 1
20190620 1
20190621 3
You may join with a calendar table which contains the full date range of interest:
WITH dates AS (
SELECT '20190617' AS dt UNION ALL
SELECT '20190618' UNION ALL
SELECT '20190619' UNION ALL
SELECT '20190620' UNION ALL
SELECT '20190621'
)
SELECT
t1.dt AS event_date,
COUNT(DISTINCT t2.user_pseudo_id) AS user_count
FROM dates t1
LEFT JOIN `mark-3314e.analytics_197261162.events_*` t2
ON t1.dt = t2.event_date AND
t2._TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)) AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
AND t2.event_name = 'session_start'
GROUP BY
t1.dt
ORDER BY
t1.dt;
For a more general way to generate a date range in BigQuery, see this SO question.
Here is a possible solution using GENERATE_DATE_ARRAY function in BigQuery.
with data as (
SELECT parse_date('%Y%m%d', event_date) AS event_date, COUNT(DISTINCT user_pseudo_id) AS user_count
FROM `mark-3314e.analytics_197261162.events_*`
WHERE _TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 7 DAY)) AND FORMAT_DATE('%Y%m%d', CURRENT_DATE())
AND event_name = 'session_start'
GROUP BY event_date
ORDER BY event_date ASC
)
select dt as event_date, user_count from (
select user_count,
if(
previousdate is null,
generate_date_array(date, date_sub(nextdate, interval 1 day), interval 1 day),
generate_date_array(date, if(nextdate is null, date, date_sub(nextdate, interval 1 day)), interval 1 day)
) as dates
from (
select
lag(event_date) over(order by event_date) as previousdate,
event_date as date,
lead(event_date) over(order by event_date) as nextdate,
user_count
from data
)
), unnest(dates) dt
Related
I'm having a hard time replicating the metrics found when accessing in the Analytics console. Particularly user and session metrics when split by campaign details (extracted from 'campaign_details' event and attributed on a last interaction basis over 90 days). If I query the data without considering campaign_details my values are as in the console. I'd be interested to know if anyone has worked with this previously and managed to get the data as in the console or if it's even possible to expect parity?
with initial_prep as (
SELECT
(select max(value.string_value) from unnest(user_properties) where key='store') store,
device.operating_system as operating_system,
event_date,
user_pseudo_id,
event_timestamp,
TIMESTAMP_MICROS(event_timestamp) AS ts,
LAG(TIMESTAMP_MICROS(event_timestamp)) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS prev_evt_ts,
IF(event_name = "session_start", 1, 0) AS is_session_start_event,
IF(event_name = "first_open", 1, 0) AS is_first_visit_event,
IF(event_name = "screen_view", 1, 0) AS is_screen_view,
IF(event_name = "purchase",1,0) as is_purchase,
ecommerce.purchase_revenue as value,
ecommerce.shipping_value as shipping,
ecommerce.total_item_quantity as quantity,
FROM
`[PROJECT DETAILS REDACTED].events_20*`
WHERE
parse_date('%y%m%d', _table_suffix) between DATE_sub(current_date(), interval 1 day) and DATE_sub(current_date(), interval 1 day)
and
device.operating_system = 'IOS'
), user_sources as (
select
user_pseudo_id,
TIMESTAMP_MICROS(event_timestamp) AS ts,
(select max(value.string_value) from unnest(event_params) where key='source' and event_name in( 'campaign_details')) source,
(select max(value.string_value) from unnest(event_params) where key='campaign' and event_name in( 'campaign_details')) campaign
from
`[PROJECT DETAILS REDACTED].events_20*`
WHERE
parse_date('%y%m%d', _table_suffix) between DATE_sub(current_date(), interval 90 day) and DATE_sub(current_date(), interval 1 day)
and
event_name in ('campaign_details')
)
, session_id_created as (
SELECT
*,
SUM(is_session_start_event) OVER (PARTITION BY user_pseudo_id ORDER BY ts) AS session_id
FROM initial_prep
)
, session_details as (
SELECT
si.user_pseudo_id,
store,
operating_system,
event_date,
event_timestamp,
session_id,
MAX(is_session_start_event) OVER (PARTITION BY si.user_pseudo_id, session_id) AS has_session_start_event,
is_session_start_event,
MAX(is_first_visit_event) OVER (PARTITION BY si.user_pseudo_id, session_id) AS has_first_visit_event,
is_first_visit_event,
is_screen_view,
MAX(event_timestamp) OVER (PARTITION BY si.user_pseudo_id, session_id) AS max_timestamp,
MIN(event_timestamp) OVER (PARTITION BY si.user_pseudo_id, session_id) AS min_timestamp,
is_purchase,
value,
shipping,
quantity,
us.source,
us.campaign,
row_number() over (partition by si.user_pseudo_id, event_timestamp order by us.ts desc) rank,
us.ts time_campaign
from session_id_created si
left join user_sources us on us.user_pseudo_id = si.user_pseudo_id and si.ts >= us.ts -->= timestamp_sub(us.ts,interval 3600000 MICROSECOND)
)
, session_fin as (
select user_pseudo_id,
store,
operating_system,
source,
campaign,
event_date,
session_id,
has_session_start_event,
has_first_visit_event,
max_timestamp,
min_timestamp,
sum(is_session_start_event) sessions_alt,
sum(is_screen_view) screen_views,
sum(value) revenue,
sum(is_purchase) transactions,
sum(shipping) shipping,
sum(quantity) item_quantity
from session_details
where rank =1
group by
user_pseudo_id,
store,
event_date,
operating_system,
session_id,
source,
campaign,
has_session_start_event,
has_first_visit_event,
max_timestamp,
min_timestamp
)
select store
, operating_system
, source
, campaign
, event_date applicabledate
, sum(sessions_alt) sessions
, sum(transactions) transactions
, sum(revenue) local_revenue
, sum(shipping) local_shipping
, sum(item_quantity) item_quantity
, avg(max_timestamp/100000 - min_timestamp/100000) avgsessionduration
, count(distinct user_pseudo_id) users
, count(distinct case when has_first_visit_event = 1 then user_pseudo_id end) new_users
, sum(screen_views) screenviews
from session_fin
group by store, event_date, operating_system
, source
, campaign
order by users desc
/**/
I want to identify users that had a "first_open" event in month a (here: january) and came back to our in month b (here: february) with an "user_engagement" event.
My idea:
1. Create a table with all users who had a "first_open" event
2. Create a table with all users who had a "user_engagement" event
3. Join both tables on userID
4. Count Users who both had a "first_open" event in month a and month b and count all users from january with the "first_open" event
With the following query I am currently overcounting both the users in month a and b, because I am not counting all unqiue users for both event types.
With
users_first_open as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS install_month,
event_name as firstopen
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190101'
AND '20190108' and event_name = "first_open" and
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 1
),
user_enagement_next_month as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS engagement_month,
event_name as engagament_next_month
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190109'
AND '20190116' and event_name = "user_engagement"
and EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 1),
cohort_raw as(
select
user_pseudo_id,
install_month,
engagement_month,
case when firstopen = "first_open" then 1 else 0 end as cohort_count_first_open,
case when engagament_next_month = "user_engagement" then 1 else 0 end as cohort_count_engagement
from
user_enagement_next_month
full join
users_first_open using (user_pseudo_id))--,
select
sum(case when cohort_count_first_open is not null then 1 else 0 end) as users_first_open,
(select sum(case when cohort_count_engagement is not null then 1 else 0 end) as u_engagement_open from cohort_raw where cohort_count_first_open = 1) as users_engagement_open
from cohort_raw
What I tried next was the following: group in table 2 "user_enagement_next_month" by userID, etc.
and create a sum of "first_open" case when and "engagement" case when results. With the later I then included the query to only count users whose count of these two was = 2
-
With
users_first_open as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS install_month,
event_name as firstopen
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190101'
AND '20190131' and event_name = "first_open" and
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 1
),
user_enagement_next_month as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS engagement_month,
event_name as engagament_next_month
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190201'
AND '20190228' and event_name = "session_start"
and EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 2
group by 1,2,3)--,
--cohort_raw as(
select
user_pseudo_id,
install_month,
engagement_month,
case when firstopen = "first_open" then 1 else 0 end as cohort_count_first_open,
case when engagament_next_month = "session_start" then 1 else 0 end as cohort_count_engagement
--case when user_pseudo_id is not null then 1 else 0 end as cohort_count_engagement
from
user_enagement_next_month
full join
users_first_open using (user_pseudo_id)),
cohort_agg as (
select *, cohort_count_first_open+cohort_count_engagement as cohort_sum
from cohort_raw
group by 1,2,3,4,5
order by 6 desc)
select
(select count(*) from users_first_open) as cohort_jan,
(select Sum(cohort_sum) from cohort_agg where cohort_sum = 2) as ret,
sum(case when cohort_count_first_open is not null then 1 else 0 end) as users_first_open,
(select sum(case when cohort_count_engagement is not null then 1 else 0 end) as u_engagement_open from cohort_raw where cohort_count_first_open = 1) as users_engagement_open
from cohort_agg
I expect a return rate of around 20%. My output at the moment is 54%, because in my query I am either overcounting or counting to little, because I assume my join does not work.
Maybe I don't clearly understand what you want, but try this one
with
users_first_open as (
select distinct -- is there duplicates for one user_id?
user_pseudo_id,
extract(
month from
timestamp_micros(user_first_touch_timestamp)
) as install_month
from
`table.events_201901*` -- longer prefixes generally perform better
where
_table_suffix between '01' and '31'
and event_name = 'first_open'
and extract(
month from
timestamp_micros(user_first_touch_timestamp)
) = 1
),
user_enagement_next_month as (
select distinct
user_pseudo_id,
extract(
month from
timestamp_micros(user_first_touch_timestamp)
) as engagement_month
from
`table.events_201902*` -- longer prefixes generally perform better
where
_table_suffix between '01' and '28'
and event_name = 'user_engagement'
and extract(
month from
timestamp_micros(user_first_touch_timestamp)
) = 2
)
select
ufo.install_month,
uenm.engagement_month,
count(*) as first_open_event_users_cnt,
count(uenm.user_pseudo_id) as user_engagement_event_users_cnt
from
users_first_open as ufo
left join user_enagement_next_month as uenm
on ufo.user_pseudo_id = uenm.user_pseudo_id
group by
1, 2
The error is on line 4 and s3.fullVisitorId is underlined. I imagine it is going to throw an error on each of the selected items in that group though.
I tried adding a SELECT statement after DISTINCT and the inline error goes away but when I try to run the query I get the error: "Table name "s3" cannot be resolved: dataset name is missing."
Any help would be appreciated. I am finding translating queries from legacy sql to standard is a bit of a headache.
Here is the full query:
#standardSQL
SELECT
CAST(CONCAT(SUBSTR(date,1,4),'-',SUBSTR(date,5,2),'-',SUBSTR(date,7,2)) AS DATE) AS Date,
COUNT(DISTINCT s3.fullVisitorId) AS users,
COUNT(s0.firstHit) AS carts,
COUNT(s1.firstHit) AS order_details,
COUNT(s2.firstHit) AS order_confirmation
FROM (
SELECT
IFNULL(s3.date,
IFNULL(s0.date,
IFNULL(s1.date,
s2.date))) AS date,
s3.fullVisitorId,
s0.fullVisitorId,
s0.visitId,
s0.firstHit,
s1.firstHit,
s2.firstHit
FROM (
# user subquery
SELECT
date,
fullVisitorId,
visitId
FROM
`big-query-project-34643.162968675.ga_sessions_*`
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
AND totals.visits = 1
GROUP BY
date,
fullVisitorId,
visitId) s3
FULL OUTER JOIN
((
# first subquery
SELECT
date,
fullVisitorId,
visitId,
MIN(h.hitNumber) AS firstHit
FROM
`big-query-project-34643.162968675.ga_sessions_*`, unnest(hits) as h
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
AND REGEXP_CONTAINS(h.page.pagePath,
'/cart')
AND totals.visits = 1
GROUP BY
date,
fullVisitorId,
visitId)) s0
ON
s3.fullVisitorId = s0.fullVisitorId
AND s3.visitId = s0.visitId
FULL OUTER JOIN
((
# Second Subquery
SELECT
date,
fullVisitorId,
visitId,
MIN(h.hitNumber) AS firstHit
FROM
`big-query-project-34643.162968675.ga_sessions_*`, unnest(hits) as h
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
AND REGEXP_CONTAINS(h.page.pagePath,
'/order-details')
AND totals.visits = 1
GROUP BY
date,
fullVisitorId,
visitId)) s1
ON
s0.fullVisitorId = s1.fullVisitorId
AND s0.visitId = s1.visitId
FULL OUTER JOIN
((
# Third Subquery
SELECT
date,
fullVisitorId,
visitId,
MIN(h.hitNumber) AS firstHit
FROM
`big-query-project-34643.162968675.ga_sessions_*`, unnest(hits) as h
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 365 DAY))
AND FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 0 DAY))
AND REGEXP_CONTAINS(h.page.pagePath,
'/orderconfirmation')
AND totals.visits = 1
GROUP BY
date,
fullVisitorId,
visitId)) s2
ON
s1.fullVisitorId = s2.fullVisitorId
AND s1.visitId = s2.visitId)
GROUP BY
date
ORDER BY
date
You have multiple issues here - notably: in lines 14-19
s3.fullVisitorId,
s0.fullVisitorId,
s0.visitId,
s0.firstHit,
s1.firstHit,
s2.firstHit
I would recommend to at least provide aliases like [for example] below
s3.fullVisitorId s3_fullVisitorId,
s0.fullVisitorId s0_fullVisitorId,
s0.visitId,
s0.firstHit s0_firstHit,
s1.firstHit s1_firstHit,
s2.firstHit s2_firstHit
and then in lines 4-7 - reference the alias
COUNT(DISTINCT s3_fullVisitorId) AS users,
COUNT(s0_firstHit) AS carts,
COUNT(s1_firstHit) AS order_details,
COUNT(s2_firstHit) AS order_confirmation
I wrote the following query below against our mobile app's data. Due to a high user-base, I am getting a 400 request error "Resources exceeded during query execution: The query could not be executed in the allotted memory" when I add the ORDER BY at the bottom.
Question: Is there anything that I can do to optimize the query, but still retain the ORDER BY at the bottom?
I already added in the firebase's demo data-set, but I think their data-set is just too small to have a problem (compared to my data-set which is 5-10 million records big).
SELECT
f.user_pseudo_id,
f.event_timestamp,
DATE(TIMESTAMP_MICROS(f.event_timestamp)) as event_timestamp_date,
f.event_name,
f.user_first_touch_timestamp,
DATE(TIMESTAMP_MICROS(f.user_first_touch_timestamp)) as user_first_touch_date,
CASE WHEN r.has_appRemove >= 1 THEN "removed" ELSE "not-removed" END AS status_after_first7days
FROM `firebase-analytics-sample-data.ios_dataset.app_events_*` f
LEFT JOIN (
SELECT user_pseudo_id, 1 has_appRemove
FROM `firebase-analytics-sample-data.ios_dataset.app_events_*`
WHERE DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY)
AND DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) < DATE_SUB(CURRENT_DATE(), INTERVAL 9 DAY)
AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
AND _TABLE_SUFFIX < FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY))
AND platform = "ANDROID"
AND event_name = "app_remove"
GROUP BY user_pseudo_id
) r on f.user_pseudo_id = r.user_pseudo_id
WHERE
DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY)
AND DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) < DATE_SUB(CURRENT_DATE(), INTERVAL 9 DAY)
AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
AND _TABLE_SUFFIX < FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY))
AND platform = "ANDROID"
ORDER BY 1,2 ASC
You can apply windowing/analytical function instead of join'ing - like in below example (not tested)
#standardSQL
SELECT
user_pseudo_id,
event_timestamp,
DATE(TIMESTAMP_MICROS(event_timestamp)) AS event_timestamp_date,
event_name,
user_first_touch_timestamp,
DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) AS user_first_touch_date,
COUNTIF(event_name = "app_remove") OVER(PARTITION BY user_pseudo_id) > 0 isRemoved
FROM `firebase-analytics-sample-data.ios_dataset.app_events_*`
WHERE
DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) >= DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY)
AND DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)) < DATE_SUB(CURRENT_DATE(), INTERVAL 9 DAY)
AND _TABLE_SUFFIX >= FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 10 DAY))
AND _TABLE_SUFFIX < FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 3 DAY))
AND platform = "ANDROID"
ORDER BY 1,2 ASC
I'm using BigQuery to report on Google Analytics data. I'm trying to recreate landing page data using BigQuery.
The following query reports 18% fewer sessions than in the Google Analytics interface:
SELECT DISTINCT
fullVisitorId,
visitID,
h.page.pagePath AS LandingPage
FROM
`project-name.dataset.ga_sessions_*`, UNNEST(hits) AS h
WHERE
hitNumber = 1
AND h.type = 'PAGE'
AND _TABLE_SUFFIX BETWEEN '20170331' AND '20170331'
ORDER BY fullVisitorId DESC
Where am I going wrong with my approach? Why can't I get to within a small margin of the number in the GA interface's reported figure?
Multiple reasons :
1.Big Query for equivalent landing page:
SELECT
LandingPage,
COUNT(sessionId) AS Sessions,
100 * SUM(totals.bounces)/COUNT(sessionId) AS BounceRate,
AVG(totals.pageviews) AS AvgPageviews,
SUM(totals.timeOnSite)/COUNT(sessionId) AS AvgTimeOnSite,
from(
SELECT
CONCAT(fullVisitorId,STRING(visitId)) AS sessionID,
totals.bounces,
totals.pageviews,
totals.timeOnSite,
hits.page.pagePath AS landingPage
FROM (
SELECT
fullVisitorId,
visitId,
hits.page.pagePath,
totals.bounces,
totals.pageviews,
totals.timeOnSite,
MIN(hits.hitNumber) WITHIN RECORD AS firstHit,
hits.hitNumber AS hitNumber
FROM (TABLE_DATE_RANGE ([XXXYYYZZZ.ga_sessions_],TIMESTAMP('2016-08-01'), TIMESTAMP ('2016-08-31')))
WHERE
hits.type = 'PAGE'
AND hits.page.pagePath'')
WHERE
hitNumber = firstHit)
GROUP BY
LandingPage
ORDER BY
Sessions DESC,
LandingPage
Next :
Pre-calculated data -- pre-aggregated tables
These are the precalculated data that Google uses to speed up the UI. Google does not specify when this is done but it can be at any point of the time. These are known as pre-aggregated tables
So if you compare the numbers from GA UI to your Big Query output, you will always see a discrepancy. Please go ahead and rely on your big query data .
You can achieve the same thing by simply adding the below to your select statement:
,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage
I can get a 1 to 1 match with the GA UI on my end when I run something like below, which is a bit more concise than the original answer:
SELECT DISTINCT
a.landingpage
,COUNT(DISTINCT(a.sessionId)) sessions
,SUM(a.bounces) bounces
,AVG(a.avg_pages) avg_pages
,(SUM(tos)/COUNT(DISTINCT(a.sessionId)))/60 session_duration
FROM
(
SELECT DISTINCT
CONCAT(CAST(fullVisitorId AS STRING),CAST(visitStartTime AS STRING)) sessionId
,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage
,totals.bounces bounces
,totals.timeonsite tos
,(SELECT COUNT(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE') avg_pages
FROM `tablename_*`
WHERE _TABLE_SUFFIX >= '20180801'
AND _TABLE_SUFFIX <= '20180808'
AND totals.visits = 1
) a
GROUP BY 1
another way here! you can get the same number :
SELECT
LandingPage,
COUNT(DISTINCT(sessionID)) AS sessions
FROM(
SELECT
CONCAT(fullVisitorId,CAST(visitId AS STRING)) AS sessionID,
FIRST_VALUE(hits.page.pagePath) OVER (PARTITION BY CONCAT(fullVisitorId,CAST(visitId AS STRING)) ORDER BY hits.hitNumber ASC ) AS LandingPage
FROM
`xxxxxxxx1.ga_sessions_*`,
UNNEST(hits) AS hits
WHERE
_TABLE_SUFFIX BETWEEN FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
AND FORMAT_DATE('%Y%m%d',DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
AND hits.type ='PAGE'
GROUP BY fullVisitorId, visitId, sessionID,hits.page.pagePath,hits.hitNumber
)
GROUP BY LandingPage
ORDER BY sessions DESC
There is a hit.isEntrance field in the schema that can be used for this purpose.
The example below would show you yesterday's landing pages:
#standardSQL
select
date,
hits.page.pagePath as landingPage,
sum(totals.visits) as visits,
sum(totals.bounces) as bounces,
sum(totals.transactions) as transactions
from
`project.dataset.ga_sessions_*`,
unnest(hits) as hits
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and hits.isEntrance = True
and totals.visits = 1 #avoid counting midnight-split sessions
group by
1, 2
order by 3 desc
There is still one source of discrepancy though, which comes from the sessions without a landing page (if you check in GA in the landing pages report, there will sometimes be a (not set) value.
In order to include those as well, you can do:
with
landing_pages_set as (
select
concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
hits.page.pagePath as virtualPagePath
from
`project.dataset.ga_sessions_*`,
unnest(hits) as hits
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and totals.visits = 1 #avoid counting midnight-split sessions
and hits.isEntrance = TRUE
group by 1, 2
),
landing_pages_not_set as (
select
concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
date,
"(not set)" as virtualPagePath,
count(distinct concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string))) as visits,
sum(totals.bounces) as bounces,
sum(totals.transactions) as transactions
from
`project.dataset.ga_sessions_*`
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and totals.visits = 1 #avoid counting midnight-split sessions
group by 1, 2, 3
),
landing_pages as (
select
l.fullVisitId as fullVisitId,
date,
coalesce(r.virtualPagePath, l.virtualPagePath) as virtualPagePath,
visits,
bounces,
transactions
from
landing_pages_not_set l left join landing_pages_set r on l.fullVisitId = r.fullVisitId
)
select virtualPagePath, sum(visits) from landing_pages group by 1 order by 2 desc