Bigquery Google Analytics Users Not Counting Unique - google-analytics

I'm having issues with the query below in BigQuery for Google Analytics. For some reason I'm not able to count unique users; it essentially counts the number of rows, so the numbers are very similar to sessions. I've also tried EXACT_COUNT_DISTINCT(), but it gives me the same answer.
-- Legacy SQL report over the ga_sessions_ tables dated 2017-03-15 to 2017-03-31:
-- visits, users, pageviews, bounces, event-based conversions and revenue
-- per day and country.
SELECT
date AS Day,
-- Property label derived from the hit-level source property tracking id.
MAX(CASE
WHEN hits.sourcePropertyInfo.sourcePropertyTrackingId CONTAINS '778****' THEN 'MUG'
WHEN hits.sourcePropertyInfo.sourcePropertyTrackingId = 'Social' THEN 'Social'ELSE 'Website' END) AS Property,
geoNetwork.country AS Country,
SUM(totals.visits) AS visits,
-- Distinct visitor ids within each output group. The second argument is
-- presumably the exactness threshold of legacy COUNT(DISTINCT field, n) -- confirm.
COUNT (DISTINCT(fullVisitorId), 1000000) AS Users,
SUM(IFNULL(totals.newVisits,0)) AS NEW,
-- Screenviews and pageviews are added together into one figure.
(SUM(IFNULL(totals.screenviews,0))+SUM(IFNULL(totals.pageviews,0))) AS PAGEVIEWS,
-- NOTE(review): a SUM(...) is nested inside the outer SUM(CASE ...) here; verify
-- that this is intended and accepted by the engine.
IFNULL(SUM(CASE
WHEN totals.screenviews = 1 THEN SUM(IFNULL(totals.screenviews,0))
ELSE 0 END)+ SUM(IFNULL(totals.bounces,0)),0) AS BOUNCES,
-- Counts hits whose event action matches a registration pattern.
SUM(CASE
WHEN REGEXP_MATCH(hits.eventInfo.eventAction,'register$|registersuccess|new registration|account signup|registro') THEN 1
ELSE 0 END) AS NewRegistrations,
-- Counts hits whose event action matches an add-to-cart pattern.
SUM(CASE
WHEN REGEXP_MATCH(hits.eventInfo.eventAction, 'add to cart|add to bag|click to buy|ass to basket|comprar') OR hits.eventInfo.eventAction CONTAINS 'addtobasket::' THEN 1
ELSE 0 END) AS ClickToBuy,
SUM(IFNULL(totals.transactions,0)) AS Transactions,
-- Revenue is divided by 1000000 (presumably stored in micros -- confirm against the export schema).
SUM(IFNULL(totals.transactionRevenue,0))/1000000 AS Revenue
FROM (TABLE_DATE_RANGE([****.ga_sessions_], TIMESTAMP('2017-03-15'), TIMESTAMP('2017-03-31'))),
-- NOTE(review): totals.screenviews is part of the grouping key, so every
-- day/country is split into one row per distinct screenviews value and Users is
-- counted separately inside each of those rows. Country and geoNetwork.country
-- name the same field.
GROUP BY
Day,
Country,
geoNetwork.country,
totals.screenviews;

I just tested this query and it seems a bit simpler:
-- Legacy SQL: sessions, users and conversion metrics per day and country
-- for the ga_sessions_ tables dated 2017-03-15 through 2017-03-31.
SELECT
  date,
  -- Property label taken from the hit-level source property tracking id.
  MAX(
    CASE
      WHEN hits.sourcePropertyInfo.sourcePropertyTrackingId CONTAINS '778****' THEN 'MUG'
      WHEN hits.sourcePropertyInfo.sourcePropertyTrackingId = 'Social' THEN 'Social'
      ELSE 'Website'
    END
  ) AS Property,
  geoNetwork.country AS Country,
  SUM(totals.visits) AS visits,
  COUNT(DISTINCT(fullVisitorId), 1000000) AS Users,
  SUM(totals.newVisits) AS NEW,
  SUM(totals.pageviews) AS PAGEVIEWS,
  SUM(totals.bounces) AS BOUNCES,
  -- Hits whose event action looks like a registration.
  SUM(
    CASE
      WHEN REGEXP_MATCH(hits.eventInfo.eventAction,'register$|registersuccess|new registration|account signup|registro') THEN 1
      ELSE 0
    END
  ) AS NewRegistrations,
  -- Hits whose event action looks like an add-to-cart click.
  SUM(
    CASE
      WHEN REGEXP_MATCH(hits.eventInfo.eventAction, 'add to cart|add to bag|click to buy|ass to basket|comprar|addtobasket::') THEN 1
      ELSE 0
    END
  ) AS ClickToBuy,
  SUM(totals.transactions) AS Transactions,
  SUM(totals.transactionRevenue) /1000000 AS Revenue
FROM
  (TABLE_DATE_RANGE([project_id:dataset_id.ga_sessions_], TIMESTAMP('2017-03-15'), TIMESTAMP('2017-03-31'))),
GROUP BY
  date,
  Country
It did work in our database (not sure why you summed screenviews with pageviews though).
In Standard SQL (highly recommended that you use this version) maybe this already solves for you:
-- Standard SQL: sessions, distinct users and conversion metrics per day and
-- country for the daily ga_sessions_YYYYMMDD tables from 2017-03-15 to 2017-03-31.
-- hits is never flattened: every hit-level condition is an EXISTS over
-- UNNEST(hits), so each session row is counted once and Users stays a true
-- distinct count of fullVisitorId per group.
SELECT
  date,
  -- Property label for the group, derived from the sessions' hits.
  MAX(
    CASE
      WHEN EXISTS(
        SELECT 1
        FROM UNNEST(hits) AS h
        WHERE REGEXP_CONTAINS(h.sourcePropertyInfo.sourcePropertyTrackingId, r'778\*\*\*\*')
      ) THEN 'MUG'
      WHEN EXISTS(
        SELECT 1
        FROM UNNEST(hits) AS h
        WHERE h.sourcePropertyInfo.sourcePropertyTrackingId = 'Social'
      ) THEN 'Social'
      ELSE 'Website'
    END
  ) AS Property,
  geoNetwork.country AS Country,
  SUM(totals.visits) AS visits,
  COUNT(DISTINCT fullVisitorId) AS Users,
  SUM(totals.newVisits) AS new_,
  SUM(totals.pageviews) AS PAGEVIEWS,
  SUM(totals.bounces) AS BOUNCES,
  -- Sessions with at least one registration-like event action.
  SUM(
    CASE
      WHEN EXISTS(
        SELECT 1
        FROM UNNEST(hits) AS h
        WHERE REGEXP_CONTAINS(h.eventInfo.eventAction, 'register$|registersuccess|new registration|account signup|registro')
      ) THEN 1
      ELSE 0
    END
  ) AS NewRegistrations,
  -- Sessions with at least one add-to-cart-like event action.
  SUM(
    CASE
      WHEN EXISTS(
        SELECT 1
        FROM UNNEST(hits) AS h
        WHERE REGEXP_CONTAINS(h.eventInfo.eventAction, 'add to cart|add to bag|click to buy|ass to basket|comprar|addtobasket::')
      ) THEN 1
      ELSE 0
    END
  ) AS ClickToBuy,
  SUM(totals.transactions) AS Transactions,
  SUM(totals.transactionRevenue) / 1000000 AS Revenue
-- The wildcard stops after the underscore so _TABLE_SUFFIX is the bare YYYYMMDD
-- date and can be range-filtered as a plain string. A looser `ga_sessions*`
-- prefix parsed with a regex would also pick up any other table sharing the
-- prefix (for example intraday tables), and a suffix that is not a valid date
-- would make PARSE_TIMESTAMP fail.
FROM `project_id.dataset_id.ga_sessions_*`
WHERE _TABLE_SUFFIX BETWEEN '20170315' AND '20170331'
GROUP BY
  date, Country

Related

How to replicate the Model Comparison Tool report from Google Analytics to Google BigQuery

I have the following report in the demo account of Google Analytics:
https://analytics.google.com/analytics/web/?utm_source=demoaccount&utm_medium=demoaccount&utm_campaign=demoaccount#/report/bf-roi-calculator/a54516992w87479473p92320289/_u.date00=20211101&_u.date01=20211128&_r.attrSel2=preset6&_r.attrSel1=preset1&_r.attrSel3=preset7/
In this report, we can see the different models of conversion attribution, e.g. Last Interaction, Last Non-Direct Click, and Last Google Ads Click. There are also other models, like First Interaction, Position based. Here's Google's documentation about the multi-channel funnels report:
https://support.google.com/analytics/topic/1191164?hl=en&ref_topic=1631741
So far, I have managed to build the following query:
-- Sessions with source/medium, hits, and page path.
-- One output row per session (grouped by visitor, start time, session key and
-- traffic-source fields) from the public GA sample tables 20170727..20170801.
WITH table_1 AS (
SELECT
fullVisitorId,
visitStartTime,
-- Session key built by concatenating visitor id, visit id and date.
CONCAT(fullVisitorId, visitId, date) AS session,
trafficSource.medium,
trafficSource.source,
-- Hit-level flag; ANY_VALUE picks an arbitrary hit's value for the session.
ANY_VALUE(social.hasSocialSourceReferral) AS social_source_referral,
trafficSource.campaign,
-- Hit numbers and page paths collected as parallel arrays, both ordered by hitNumber.
ARRAY_AGG(hitNumber ORDER BY hitNumber) AS hit_number,
ARRAY_AGG(page.pagePath ORDER BY hitNumber) AS page_path
-- The comma join with UNNEST(hits) produces one row per hit before grouping.
FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`, UNNEST(hits) AS hits_
WHERE _TABLE_SUFFIX BETWEEN '20170727' AND '20170801'
GROUP BY fullVisitorId, visitStartTime, session, medium, source, campaign),
-- Adding the MCF channel grouping and creating a field that indicates sessions with conversions
table_2 AS (
SELECT
fullVisitorId,
visitStartTime,
-- Channel grouping: the first matching rule wins, so rule order matters.
CASE
WHEN source = '(direct)' AND (medium = '(not set)' OR medium = '(none)') THEN 'Direct'
WHEN medium = 'organic' THEN 'Organic Search'
WHEN social_source_referral = 'Yes' AND REGEXP_CONTAINS(medium, r'^(social|social-network|social-media|sm|social network|social media)$') THEN 'Social'
WHEN medium = 'email' THEN 'Email'
WHEN medium = 'affiliate' THEN 'Affiliate'
WHEN medium = 'referral' THEN 'Referral'
WHEN REGEXP_CONTAINS(medium, r'^(cpc|ppc|paidsearch)$') THEN 'Paid Search'
WHEN REGEXP_CONTAINS(medium, r'^(cpv|cpa|cpp|content-text)$') THEN 'Other Advertising'
WHEN REGEXP_CONTAINS(medium, r'^(display|cpm|banner)$') THEN 'Display'
ELSE 'Other'
END AS mcf_channel_grouping,
medium,
source,
campaign,
-- conversion is 1 when any page path of the session starts with
-- /ordercompleted.html, otherwise 0 (boolean EXISTS cast to INT64).
CAST(
EXISTS(
SELECT *
FROM UNNEST(page_path) AS x
WHERE REGEXP_CONTAINS(x, r'^/ordercompleted\.html')
)
AS INT64
) AS conversion
FROM table_1
-- NOTE(review): ordering inside a CTE is not relied on by the later steps, which
-- define their own window and final ORDER BY clauses.
ORDER BY fullVisitorId
),
-- Filtering by sessions with conversions:
-- keeps every session of any visitor who has at least one converting session.
table_3 AS (
SELECT *
FROM table_2
-- WHERE TRUE is a no-op filter placed in front of QUALIFY.
WHERE TRUE
-- Window count of converting sessions per visitor; visitors with none are dropped.
QUALIFY COUNTIF(conversion = 1) OVER (PARTITION BY fullVisitorId) > 0
),
-- Adding the attribution models.
-- Each model column holds the string '1' when the session receives credit and
-- the string 'null' (a literal, not SQL NULL) otherwise.
table_4 AS (
SELECT
fullVisitorId,
DATE(TIMESTAMP_SECONDS(visitStartTime)) AS date,
visitStartTime AS date_sec,
mcf_channel_grouping,
medium,
source,
campaign,
conversion,
-- Last touch: a converting session that has an earlier session for the same
-- visitor, or that is the visitor's first session.
CASE
WHEN conversion > 0 AND visitStartTime > LAG(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) THEN '1'
WHEN conversion > 0 AND visitStartTime = FIRST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) THEN '1'
ELSE 'null'
END AS last_touch_attribution,
-- Last non-direct: branches are evaluated in order and the first match wins.
-- NOTE(review): the comparisons below use the literal 'direct', whereas the
-- channel grouping in table_2 matches the source value '(direct)'. If source
-- really carries '(direct)', then source != 'direct' is always true and the
-- LEAD(source) = 'direct' branch never fires -- confirm against the data.
CASE
WHEN conversion > 0 AND visitStartTime = FIRST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) THEN '1'
WHEN conversion = 0 AND visitStartTime = FIRST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) THEN 'null'
WHEN conversion = 0 AND LAG(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) = FIRST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) THEN 'null'
WHEN SUM(conversion) OVER (PARTITION BY fullVisitorId) > 0 AND visitStartTime < LAST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
AND LEAD(source) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) = 'direct' AND source != 'direct' THEN '1'
WHEN conversion > 0 AND visitStartTime = LAST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AND source != 'direct' THEN '1'
ELSE 'null'
END AS last_non_direct,
-- First touch: the visitor's earliest session, provided the visitor converted at all.
CASE
WHEN MAX(conversion) OVER (PARTITION BY fullVisitorId) = 1 AND visitStartTime = FIRST_VALUE(visitStartTime) OVER (PARTITION BY fullVisitorId ORDER BY visitStartTime) THEN '1'
ELSE 'null'
END AS first_touch_attribution,
-- Any touch: every session of a visitor who converted.
CASE
WHEN MAX(conversion) OVER (PARTITION BY fullVisitorId) = 1 THEN '1'
ELSE 'null'
END AS any_touch_attribution,
-- Blog only: sessions with source 'blog' of a visitor who converted.
CASE
WHEN MAX(conversion) OVER (PARTITION BY fullVisitorId) = 1 AND source = 'blog' THEN '1'
ELSE 'null'
END AS blog_only
FROM table_3
ORDER BY fullVisitorId, visitStartTime
)
-- Final output: one row per session with all attribution flags.
SELECT *
FROM table_4
The issue I have is that the Last Non-Direct model is not calculated correctly and I don't know how to create the look-back window that allows me to set n days prior to conversion.
How could we replicate this report in BigQuery using Standard SQL? Thanks.

Firebase BigQuery Export Campaign Performance

I'm having a hard time replicating the metrics found when accessing in the Analytics console. Particularly user and session metrics when split by campaign details (extracted from 'campaign_details' event and attributed on a last interaction basis over 90 days). If I query the data without considering campaign_details my values are as in the console. I'd be interested to know if anyone has worked with this previously and managed to get the data as in the console or if it's even possible to expect parity?
-- initial_prep: one row per iOS event from yesterday's export table, with
-- 0/1 flags for the event types used later and the purchase figures.
with initial_prep as (
SELECT
(select max(value.string_value) from unnest(user_properties) where key='store') store,
device.operating_system as operating_system,
event_date,
user_pseudo_id,
event_timestamp,
-- event_timestamp is converted with TIMESTAMP_MICROS, i.e. it is treated as microseconds.
TIMESTAMP_MICROS(event_timestamp) AS ts,
-- Timestamp of the same user's previous event.
LAG(TIMESTAMP_MICROS(event_timestamp)) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS prev_evt_ts,
IF(event_name = "session_start", 1, 0) AS is_session_start_event,
IF(event_name = "first_open", 1, 0) AS is_first_visit_event,
IF(event_name = "screen_view", 1, 0) AS is_screen_view,
IF(event_name = "purchase",1,0) as is_purchase,
ecommerce.purchase_revenue as value,
ecommerce.shipping_value as shipping,
ecommerce.total_item_quantity as quantity,
FROM
`[PROJECT DETAILS REDACTED].events_20*`
WHERE
-- Both bounds are yesterday, so only a single day is read.
parse_date('%y%m%d', _table_suffix) between DATE_sub(current_date(), interval 1 day) and DATE_sub(current_date(), interval 1 day)
and
device.operating_system = 'IOS'
-- user_sources: every campaign_details event of the last 90 days with its
-- source and campaign parameters; not restricted to iOS.
), user_sources as (
select
user_pseudo_id,
TIMESTAMP_MICROS(event_timestamp) AS ts,
(select max(value.string_value) from unnest(event_params) where key='source' and event_name in( 'campaign_details')) source,
(select max(value.string_value) from unnest(event_params) where key='campaign' and event_name in( 'campaign_details')) campaign
from
`[PROJECT DETAILS REDACTED].events_20*`
WHERE
parse_date('%y%m%d', _table_suffix) between DATE_sub(current_date(), interval 90 day) and DATE_sub(current_date(), interval 1 day)
and
event_name in ('campaign_details')
)
-- session_id_created: numbers sessions per user with a running sum of the
-- session_start flag in timestamp order, so each session_start event begins a
-- new session_id. Events before a user's first session_start get session_id 0.
, session_id_created as (
SELECT
*,
SUM(is_session_start_event) OVER (PARTITION BY user_pseudo_id ORDER BY ts) AS session_id
FROM initial_prep
)
-- session_details: event rows enriched with per-session aggregates and with the
-- campaign_details events the user logged at or before the event.
-- The join can return several campaign rows per event; `rank` orders them from
-- most recent to oldest so that rank = 1 is the latest campaign touch.
, session_details as (
SELECT
si.user_pseudo_id,
store,
operating_system,
event_date,
event_timestamp,
session_id,
MAX(is_session_start_event) OVER (PARTITION BY si.user_pseudo_id, session_id) AS has_session_start_event,
is_session_start_event,
MAX(is_first_visit_event) OVER (PARTITION BY si.user_pseudo_id, session_id) AS has_first_visit_event,
is_first_visit_event,
is_screen_view,
-- Last and first event timestamp of the session.
MAX(event_timestamp) OVER (PARTITION BY si.user_pseudo_id, session_id) AS max_timestamp,
MIN(event_timestamp) OVER (PARTITION BY si.user_pseudo_id, session_id) AS min_timestamp,
is_purchase,
value,
shipping,
quantity,
us.source,
us.campaign,
-- NOTE(review): the partition is (user, event_timestamp), so two events of one
-- user that share a timestamp also share a partition and only one of them gets
-- rank = 1 -- confirm whether that can happen in the data.
row_number() over (partition by si.user_pseudo_id, event_timestamp order by us.ts desc) rank,
us.ts time_campaign
from session_id_created si
left join user_sources us on us.user_pseudo_id = si.user_pseudo_id and si.ts >= us.ts -->= timestamp_sub(us.ts,interval 3600000 MICROSECOND)
)
-- session_fin: collapses the event rows to one row per user, session and
-- attributed source/campaign, summing the event flags and purchase figures.
, session_fin as (
select user_pseudo_id,
store,
operating_system,
source,
campaign,
event_date,
session_id,
has_session_start_event,
has_first_visit_event,
max_timestamp,
min_timestamp,
sum(is_session_start_event) sessions_alt,
sum(is_screen_view) screen_views,
sum(value) revenue,
sum(is_purchase) transactions,
sum(shipping) shipping,
sum(quantity) item_quantity
from session_details
-- Keep only the most recent campaign match for each event.
where rank =1
-- NOTE(review): source and campaign are part of the grouping key, so a session
-- whose events resolve to different campaigns is split across several rows.
group by
user_pseudo_id,
store,
event_date,
operating_system,
session_id,
source,
campaign,
has_session_start_event,
has_first_visit_event,
max_timestamp,
min_timestamp
)
-- Final report: sessions, purchases, users and screen views per store, operating
-- system, attributed source/campaign and day, ordered by user count.
select store
, operating_system
, source
, campaign
, event_date applicabledate
, sum(sessions_alt) sessions
, sum(transactions) transactions
, sum(revenue) local_revenue
, sum(shipping) local_shipping
, sum(item_quantity) item_quantity
-- max_timestamp / min_timestamp come from event_timestamp, which the earlier
-- steps treat as microseconds (TIMESTAMP_MICROS); dividing the span by
-- 1,000,000 yields the average session duration in seconds.
, avg((max_timestamp - min_timestamp) / 1000000) avgsessionduration
, count(distinct user_pseudo_id) users
-- A user counts as new when one of their sessions contains a first_open event.
, count(distinct case when has_first_visit_event = 1 then user_pseudo_id end) new_users
, sum(screen_views) screenviews
from session_fin
group by store, event_date, operating_system
, source
, campaign
order by users desc
/**/

Join unique UserIDs with previous month' first time UserIDs

I want to identify users that had a "first_open" event in month a (here: January) and came back to our app in month b (here: February) with a "user_engagement" event.
My idea:
1. Create a table with all users who had a "first_open" event
2. Create a table with all users who had a "user_engagement" event
3. Join both tables on userID
4. Count Users who both had a "first_open" event in month a and month b and count all users from january with the "first_open" event
With the following query I am currently overcounting the users in both month a and month b, because I am not counting unique users for either event type.
-- First attempt: users with a first_open event (tables 20190101..20190108)
-- full-joined to users with a user_engagement event (tables 20190109..20190116).
With
-- One row per first_open EVENT; there is no DISTINCT or GROUP BY, so a user
-- with several matching events appears several times.
users_first_open as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS install_month,
event_name as firstopen
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190101'
AND '20190108' and event_name = "first_open" and
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 1
),
-- One row per user_engagement EVENT, again without de-duplication.
-- engagement_month is derived from user_first_touch_timestamp, not from the
-- time of the engagement event itself.
user_enagement_next_month as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS engagement_month,
event_name as engagament_next_month
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190109'
AND '20190116' and event_name = "user_engagement"
and EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 1),
-- Full join on user id: each first_open row pairs with every engagement row of
-- the same user, so the row count multiplies for users with many events.
cohort_raw as(
select
user_pseudo_id,
install_month,
engagement_month,
case when firstopen = "first_open" then 1 else 0 end as cohort_count_first_open,
case when engagament_next_month = "user_engagement" then 1 else 0 end as cohort_count_engagement
from
user_enagement_next_month
full join
users_first_open using (user_pseudo_id))--,
-- NOTE(review): both cohort_count_* columns come from CASE ... ELSE 0 and are
-- therefore never NULL, so the "is not null" tests below are always true and
-- the sums count joined rows rather than users.
select
sum(case when cohort_count_first_open is not null then 1 else 0 end) as users_first_open,
(select sum(case when cohort_count_engagement is not null then 1 else 0 end) as u_engagement_open from cohort_raw where cohort_count_first_open = 1) as users_engagement_open
from cohort_raw
What I tried next was the following: group in table 2 "user_enagement_next_month" by userID, etc.
and create a sum of the "first_open" and "engagement" CASE WHEN results. With the latter, I then changed the query to only count users for whom the sum of these two was 2.
-
-- Second attempt: January first_open users joined to February session_start
-- users, with the engagement CTE de-duplicated through GROUP BY.
With
-- One row per first_open event in the January tables (not de-duplicated).
users_first_open as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS install_month,
event_name as firstopen
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190101'
AND '20190131' and event_name = "first_open" and
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 1
),
-- NOTE(review): this CTE keeps only users whose user_first_touch_timestamp falls
-- in month 2, while users_first_open keeps only month 1. Both filters read the
-- same column, so if it holds one value per user the two sets cannot overlap --
-- confirm.
user_enagement_next_month as (select
user_pseudo_id,
EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) AS engagement_month,
event_name as engagament_next_month
FROM
`table.events_*`
where _TABLE_SUFFIX BETWEEN '20190201'
AND '20190228' and event_name = "session_start"
and EXTRACT (Month FROM(DATE(TIMESTAMP_MICROS(user_first_touch_timestamp)))) = 2
group by 1,2,3)--,
-- NOTE(review): the cohort_raw header below is commented out, yet cohort_raw is
-- still referenced further down and the join is followed by an extra closing
-- parenthesis; as written the statement does not parse.
--cohort_raw as(
select
user_pseudo_id,
install_month,
engagement_month,
case when firstopen = "first_open" then 1 else 0 end as cohort_count_first_open,
case when engagament_next_month = "session_start" then 1 else 0 end as cohort_count_engagement
--case when user_pseudo_id is not null then 1 else 0 end as cohort_count_engagement
from
user_enagement_next_month
full join
users_first_open using (user_pseudo_id)),
-- cohort_sum is 2 only for rows that have both a first_open and an engagement match.
cohort_agg as (
select *, cohort_count_first_open+cohort_count_engagement as cohort_sum
from cohort_raw
group by 1,2,3,4,5
order by 6 desc)
-- NOTE(review): ret sums cohort_sum (2 per matching row) instead of counting
-- rows, and cohort_jan counts first_open events rather than distinct users.
select
(select count(*) from users_first_open) as cohort_jan,
(select Sum(cohort_sum) from cohort_agg where cohort_sum = 2) as ret,
sum(case when cohort_count_first_open is not null then 1 else 0 end) as users_first_open,
(select sum(case when cohort_count_engagement is not null then 1 else 0 end) as u_engagement_open from cohort_raw where cohort_count_first_open = 1) as users_engagement_open
from cohort_agg
I expect a return rate of around 20%. My output at the moment is 54%; I am either overcounting or counting too little, and I assume my join does not work.
Maybe I don't clearly understand what you want, but try this one:
-- Retention of the January 2019 install cohort in February 2019.
-- Returns one row per install month with the number of users who had a
-- first_open event in January and how many of them logged a user_engagement
-- event in February.
with
  -- One row per user with a first_open event in the January tables.
  users_first_open as (
    select distinct -- a user can log the event more than once
      user_pseudo_id,
      extract(
        month from
          timestamp_micros(user_first_touch_timestamp)
      ) as install_month
    from
      `table.events_201901*` -- longer prefixes generally perform better
    where
      _table_suffix between '01' and '31'
      and event_name = 'first_open'
      and extract(
        month from
          timestamp_micros(user_first_touch_timestamp)
      ) = 1
  ),
  -- One row per user and month with a user_engagement event in the February
  -- tables. The month is taken from the event's own date: filtering on
  -- user_first_touch_timestamp here (as the January CTE does) would keep only
  -- users first seen in February, who can never match a January install.
  user_enagement_next_month as (
    select distinct
      user_pseudo_id,
      extract(
        month from
          parse_date('%Y%m%d', event_date)
      ) as engagement_month
    from
      `table.events_201902*` -- longer prefixes generally perform better
    where
      _table_suffix between '01' and '28'
      and event_name = 'user_engagement'
  )
select
  ufo.install_month,
  -- NULL when nobody from the cohort came back.
  max(uenm.engagement_month) as engagement_month,
  -- Whole cohort, whether or not the user returned.
  count(distinct ufo.user_pseudo_id) as first_open_event_users_cnt,
  -- Cohort members with a match in the engagement set (the left join leaves
  -- the others NULL, and COUNT ignores NULLs).
  count(distinct uenm.user_pseudo_id) as user_engagement_event_users_cnt
from
  users_first_open as ufo
  left join user_enagement_next_month as uenm
    on ufo.user_pseudo_id = uenm.user_pseudo_id
-- Grouping by install month only keeps the cohort total and the returning
-- users on the same row, so the return rate is the ratio of the two counts.
group by
  ufo.install_month

Recreate GA Funnel on BigQuery

I am trying to recreate the GA funnel (custom report on Google360) using BigQuery. The funnel on GA is using the unique count of events that happen on each page. I found this query online that is working for the most part:
-- Legacy SQL funnel: visitors with a landing_page event (s0), left-joined to
-- visitors with a model_selection_page event (s1), on a single day's table.
SELECT
COUNT( s0.firstHit) AS Landing_Page,
COUNT( s1.firstHit) AS Model_Selection
from(
SELECT
s0.fullvisitorID,
s0.firstHit,
s1.firstHit,
FROM (
# Begin Subquery #1 aka s0
-- Lowest hit number of a landing_page event per visitor.
SELECT
fullvisitorID,
MIN(hits.hitNumber) AS firstHit
FROm [64269470.ga_sessions_20170720]
WHERE
hits.eventInfo.eventAction in ('landing_page')
AND totals.visits = 1
GROUP BY
fullvisitorID
) s0
# End Subquery #1 aka s0
left join (
# Begin Subquery #2 aka s1
-- Lowest hit number of a model_selection_page event per visitor.
SELECT
fullvisitorID,
MIN(hits.hitNumber) AS firstHit
FROM [64269470.ga_sessions_20170720]
WHERE
hits.eventInfo.eventAction in ('model_selection_page')
AND totals.visits = 1
GROUP BY
fullvisitorID,
) s1
-- NOTE(review): the join matches on visitor id only. It does not require the two
-- events to be in the same session, nor s1.firstHit to come after s0.firstHit.
ON
s0.fullvisitorID = s1.fullvisitorID
)
The query works fine and the value for landing page is the same as I can get on GA, but Model_Selection is about 10% higher. This difference also increases along the funnel (I only posted 2 steps for clarity).
Any idea what I am missing here?
This query does what you need but in Standard SQL Version:
#standardSQL
-- Hit-level funnel counts over one day's sessions.
-- Landing_Page: total number of landing_page event hits.
-- Model_Selection: model_selection_page event hits, counted only in sessions
-- that also contain at least one landing_page hit.
SELECT
  SUM((
    SELECT COUNTIF(h.eventInfo.eventAction = 'landing_page')
    FROM UNNEST(hits) AS h
  )) AS Landing_Page,
  SUM((
    SELECT COUNTIF(h.eventInfo.eventAction = 'model_selection_page')
    FROM UNNEST(hits) AS h
    WHERE EXISTS(
      SELECT 1
      FROM UNNEST(hits) AS lp
      WHERE lp.eventInfo.eventAction = 'landing_page'
    )
  )) AS Model_Selection
FROM `64269470.ga_sessions_20170720`
Just that. 4 lines, way faster and cheaper.
You can also play with simulated data, something like:
#standardSQL
-- Same funnel counts, run against four hand-built sessions of visitor '1':
--   1) one landing_page hit
--   2) two landing_page hits
--   3) one landing_page hit and one model_selection_page hit
--   4) two model_selection_page hits and no landing_page hit
WITH data AS(
  SELECT
    '1' AS fullvisitorid,
    ARRAY<STRUCT<eventInfo STRUCT<eventAction STRING > >> [
      STRUCT(STRUCT('landing_page' AS eventAction) AS eventInfo)
    ] AS hits
  UNION ALL
  SELECT
    '1' AS fullvisitorid,
    ARRAY<STRUCT<eventInfo STRUCT<eventAction STRING > >> [
      STRUCT(STRUCT('landing_page' AS eventAction) AS eventInfo),
      STRUCT(STRUCT('landing_page' AS eventAction) AS eventInfo)
    ] AS hits
  UNION ALL
  SELECT
    '1' AS fullvisitorid,
    ARRAY<STRUCT<eventInfo STRUCT<eventAction STRING > >> [
      STRUCT(STRUCT('landing_page' AS eventAction) AS eventInfo),
      STRUCT(STRUCT('model_selection_page' AS eventAction) AS eventInfo)
    ] AS hits
  UNION ALL
  SELECT
    '1' AS fullvisitorid,
    ARRAY<STRUCT<eventInfo STRUCT<eventAction STRING > >> [
      STRUCT(STRUCT('model_selection_page' AS eventAction) AS eventInfo),
      STRUCT(STRUCT('model_selection_page' AS eventAction) AS eventInfo)
    ] AS hits
)
SELECT
  SUM((
    SELECT COUNTIF(h.eventInfo.eventAction = 'landing_page')
    FROM UNNEST(hits) AS h
  )) AS Landing_Page,
  SUM((
    SELECT COUNTIF(h.eventInfo.eventAction = 'model_selection_page')
    FROM UNNEST(hits) AS h
    WHERE EXISTS(
      SELECT 1
      FROM UNNEST(hits) AS lp
      WHERE lp.eventInfo.eventAction = 'landing_page'
    )
  )) AS Model_Selection
FROM data
Notice that building this type of report in GA might be a bit more difficult, as you need to select visitors who fired the 'landing_page' event at least once and then fired the 'model_selection_page' event. Make sure this report is built correctly in GA as well (one way might be to first build a custom report with only customers who fired 'landing_page' and then apply a second filter looking for 'model_selection_page').
[EDIT]:
You asked in your comment about bringing this counting on the session and user level. For counting each session, you can limit the results to 1 for each sub-query evaluation, like so:
-- Session-level funnel: each scalar subquery yields 1 for a qualifying session
-- (LIMIT 1 caps it at one row) and NULL otherwise, so SUM counts sessions.
SELECT
  SUM((
    SELECT 1
    FROM UNNEST(hits) AS h
    WHERE h.eventInfo.eventAction = 'landing_page'
    LIMIT 1
  )) AS Landing_Page,
  SUM((
    SELECT 1
    FROM UNNEST(hits) AS h
    WHERE EXISTS(
        SELECT 1
        FROM UNNEST(hits) AS lp
        WHERE lp.eventInfo.eventAction = 'landing_page'
      )
      AND h.eventInfo.eventAction = 'model_selection_page'
    LIMIT 1
  )) AS Model_Selection
FROM data
For counting distinct users, the idea is the same but you'd have to apply a COUNT(DISTINCT) operation, like so:
-- User-level funnel: each scalar subquery yields the session's fullvisitorid
-- when the session qualifies (NULL otherwise), and COUNT(DISTINCT ...) counts
-- the distinct visitors across sessions.
SELECT
  COUNT(DISTINCT (
    SELECT fullvisitorid
    FROM UNNEST(hits) AS h
    WHERE h.eventInfo.eventAction = 'landing_page'
    LIMIT 1
  )) AS Landing_Page,
  COUNT(DISTINCT (
    SELECT fullvisitorid
    FROM UNNEST(hits) AS h
    WHERE EXISTS(
        SELECT 1
        FROM UNNEST(hits) AS lp
        WHERE lp.eventInfo.eventAction = 'landing_page'
      )
      AND h.eventInfo.eventAction = 'model_selection_page'
    LIMIT 1
  )) AS Model_Selection
FROM data

Big Query landing page figures not consistent with Google Analytics interface

I'm using BigQuery to report on Google Analytics data. I'm trying to recreate landing page data using BigQuery.
The following query reports 18% fewer sessions than in the Google Analytics interface:
-- Distinct (visitor, visit, page path) combinations taken from the first hit
-- of each session on 2017-03-31.
SELECT DISTINCT
fullVisitorId,
visitID,
h.page.pagePath AS LandingPage
FROM
`project-name.dataset.ga_sessions_*`, UNNEST(hits) AS h
WHERE
-- NOTE(review): both conditions must hold for the same hit, so a session whose
-- hit number 1 is not of type 'PAGE' yields no row at all -- presumably one
-- source of the missing sessions; confirm.
hitNumber = 1
AND h.type = 'PAGE'
AND _TABLE_SUFFIX BETWEEN '20170331' AND '20170331'
ORDER BY fullVisitorId DESC
Where am I going wrong with my approach? Why can't I get to within a small margin of the number in the GA interface's reported figure?
There are multiple reasons:
1. BigQuery query for the equivalent landing page report:
-- Legacy SQL landing page report for August 2016: sessions, bounce rate,
-- average pageviews and average time on site per landing page.
SELECT
  LandingPage,
  COUNT(sessionId) AS Sessions,
  100 * SUM(totals.bounces)/COUNT(sessionId) AS BounceRate,
  AVG(totals.pageviews) AS AvgPageviews,
  SUM(totals.timeOnSite)/COUNT(sessionId) AS AvgTimeOnSite
FROM (
  -- One row per session: the page hit whose hit number equals the session's
  -- lowest page hit number.
  SELECT
    CONCAT(fullVisitorId,STRING(visitId)) AS sessionID,
    totals.bounces,
    totals.pageviews,
    totals.timeOnSite,
    hits.page.pagePath AS landingPage
  FROM (
    -- All page hits with a non-empty path, each carrying the session's first
    -- such hit number (computed per record with WITHIN RECORD).
    SELECT
      fullVisitorId,
      visitId,
      hits.page.pagePath,
      totals.bounces,
      totals.pageviews,
      totals.timeOnSite,
      MIN(hits.hitNumber) WITHIN RECORD AS firstHit,
      hits.hitNumber AS hitNumber
    FROM (TABLE_DATE_RANGE ([XXXYYYZZZ.ga_sessions_],TIMESTAMP('2016-08-01'), TIMESTAMP ('2016-08-31')))
    WHERE
      hits.type = 'PAGE'
      -- Exclude hits with an empty page path; the comparison operator is required here.
      AND hits.page.pagePath <> '')
  WHERE
    hitNumber = firstHit)
GROUP BY
  LandingPage
ORDER BY
  Sessions DESC,
  LandingPage
Next :
Pre-calculated data -- pre-aggregated tables
This is precalculated data that Google uses to speed up the UI. Google does not specify when the calculation is done, but it can happen at any point in time. These are known as pre-aggregated tables.
So if you compare the numbers from the GA UI to your BigQuery output, you will always see a discrepancy. Please go ahead and rely on your BigQuery data.
You can achieve the same thing by simply adding the below to your select statement:
,(SELECT page.pagePath FROM UNNEST(hits) WHERE hitnumber = (SELECT MIN(hitnumber) FROM UNNEST(hits) WHERE type = 'PAGE')) landingpage
I can get a 1 to 1 match with the GA UI on my end when I run something like below, which is a bit more concise than the original answer:
-- Landing page report for 2018-08-01 .. 2018-08-08: sessions, bounces, average
-- page hits and session duration (time on site divided by 60) per landing page.
WITH session_rows AS (
  -- One distinct row per session with its landing page, i.e. the page path of
  -- the lowest-numbered hit of type 'PAGE'.
  SELECT DISTINCT
    CONCAT(CAST(fullVisitorId AS STRING), CAST(visitStartTime AS STRING)) AS sessionId,
    (
      SELECT h.page.pagePath
      FROM UNNEST(hits) AS h
      WHERE h.hitNumber = (
        SELECT MIN(p.hitNumber)
        FROM UNNEST(hits) AS p
        WHERE p.type = 'PAGE'
      )
    ) AS landingpage,
    totals.bounces AS bounces,
    totals.timeonsite AS tos,
    (
      SELECT COUNT(p.hitNumber)
      FROM UNNEST(hits) AS p
      WHERE p.type = 'PAGE'
    ) AS avg_pages
  FROM `tablename_*`
  WHERE _TABLE_SUFFIX BETWEEN '20180801' AND '20180808'
    AND totals.visits = 1
)
-- Grouping by landing page already yields one row per page, so no DISTINCT is
-- needed on the outer query.
SELECT
  a.landingpage,
  COUNT(DISTINCT a.sessionId) AS sessions,
  SUM(a.bounces) AS bounces,
  AVG(a.avg_pages) AS avg_pages,
  (SUM(a.tos) / COUNT(DISTINCT a.sessionId)) / 60 AS session_duration
FROM session_rows AS a
GROUP BY a.landingpage
Here is another way that gives you the same number:
-- Sessions per landing page for yesterday's table. The landing page is the
-- page path of the session's first page hit, taken with FIRST_VALUE over the
-- hits ordered by hit number.
SELECT
  LandingPage,
  COUNT(DISTINCT sessionID) AS sessions
FROM (
  SELECT
    CONCAT(fullVisitorId, CAST(visitId AS STRING)) AS sessionID,
    FIRST_VALUE(hits.page.pagePath) OVER (
      PARTITION BY CONCAT(fullVisitorId, CAST(visitId AS STRING))
      ORDER BY hits.hitNumber ASC
    ) AS LandingPage
  FROM `xxxxxxxx1.ga_sessions_*`
  CROSS JOIN UNNEST(hits) AS hits
  WHERE
    -- Single-day range: only yesterday's table is read.
    _TABLE_SUFFIX = FORMAT_DATE('%Y%m%d', DATE_SUB(CURRENT_DATE(), INTERVAL 1 DAY))
    AND hits.type = 'PAGE'
  GROUP BY
    fullVisitorId,
    visitId,
    sessionID,
    hits.page.pagePath,
    hits.hitNumber
)
GROUP BY LandingPage
ORDER BY sessions DESC
There is a hit.isEntrance field in the schema that can be used for this purpose.
The example below would show you yesterday's landing pages:
#standardSQL
-- Yesterday's landing pages: visits, bounces and transactions per date and
-- entrance page, using the hit-level isEntrance flag.
select
  date,
  hits.page.pagePath as landingPage,
  sum(totals.visits) as visits,
  sum(totals.bounces) as bounces,
  sum(totals.transactions) as transactions
from `project.dataset.ga_sessions_*`
cross join unnest(hits) as hits
where
  -- single-day range: only yesterday's table is read
  _table_suffix = format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
  and hits.isEntrance = true
  and totals.visits = 1 #avoid counting midnight-split sessions
group by
  date,
  landingPage
order by
  visits desc
There is still one source of discrepancy though, which comes from the sessions without a landing page (if you check the landing pages report in GA, there will sometimes be a "(not set)" value).
In order to include those as well, you can do:
-- Yesterday's visits per landing page, including sessions that have no entrance
-- hit, which are reported under "(not set)".
with
-- Sessions that have an entrance hit, with that hit's page path.
-- NOTE(review): a session with several entrance hits on different paths would
-- produce several rows here and duplicate its metrics in the join below -- confirm
-- whether that can occur.
landing_pages_set as (
select
concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
hits.page.pagePath as virtualPagePath
from
`project.dataset.ga_sessions_*`,
unnest(hits) as hits
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and totals.visits = 1 #avoid counting midnight-split sessions
and hits.isEntrance = TRUE
group by 1, 2
),
-- Despite its name, this CTE holds EVERY session of the day (there is no
-- isEntrance filter), each with the placeholder path "(not set)" and its
-- session-level metrics. It is the left side of the join below.
landing_pages_not_set as (
select
concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string)) as fullVisitId,
date,
"(not set)" as virtualPagePath,
count(distinct concat(cast(fullVisitorId as string), cast(visitId as string), cast(date as string))) as visits,
sum(totals.bounces) as bounces,
sum(totals.transactions) as transactions
from
`project.dataset.ga_sessions_*`
where
(_table_suffix
between format_date("%Y%m%d", date_sub(current_date(), interval 1 day))
and format_date("%Y%m%d", date_sub(current_date(), interval 1 day)))
and totals.visits = 1 #avoid counting midnight-split sessions
group by 1, 2, 3
),
-- Every session keeps its metrics; the real entrance path replaces the
-- "(not set)" placeholder whenever the session has an entrance hit.
landing_pages as (
select
l.fullVisitId as fullVisitId,
date,
coalesce(r.virtualPagePath, l.virtualPagePath) as virtualPagePath,
visits,
bounces,
transactions
from
landing_pages_not_set l left join landing_pages_set r on l.fullVisitId = r.fullVisitId
)
-- Total visits per landing page, most visited first.
select virtualPagePath, sum(visits) from landing_pages group by 1 order by 2 desc

Resources