From f18a2eb52008dd148d7d09aee469a7e5b2461e65 Mon Sep 17 00:00:00 2001 From: uri Date: Fri, 6 Sep 2024 17:08:50 +0200 Subject: [PATCH] First version of name unification for a deal --- models/intermediate/core/int_core__deal.sql | 61 +++++++++++++++++---- models/intermediate/core/schema.yaml | 15 ++++- 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/models/intermediate/core/int_core__deal.sql b/models/intermediate/core/int_core__deal.sql index f975148..59b1cbf 100644 --- a/models/intermediate/core/int_core__deal.sql +++ b/models/intermediate/core/int_core__deal.sql @@ -24,23 +24,64 @@ with c.country_name as main_billing_country_name_per_deal, c.iso_2 as main_billing_country_iso_2_per_deal, c.iso_3 as main_billing_country_iso_3_per_deal - from billing_countries_per_deal bcpd - left join int_core__country c - on bcpd.id_billing_country = c.id_country + from billing_countries_per_deal bcpd + left join int_core__country c on bcpd.id_billing_country = c.id_country where bcpd.rn = 1 + ), + potential_name_per_deal as ( + select + id_deal, + trim( + regexp_replace(potential_name, '[0-9\(\)\-\.\,@]', '', 'g') + ) as potential_deal_name, + count(*) as potential_name_count + from + ( + select id_deal, first_name as potential_name + from int_core__unified_user + where id_deal is not null + union all + select id_deal, last_name as potential_name + from int_core__unified_user + where id_deal is not null + union all + select id_deal, company_name as potential_name + from int_core__unified_user + where id_deal is not null + ) as names + -- The following removes tons of names that are just Bookings, + -- LTD, LLC, INC, or empty strings + where + trim(upper(potential_name)) not in ('BOOKINGS', 'LTD', 'LLC', 'INC', '') + and potential_name is not null + group by 1, 2 + ), + unique_name_per_deal as ( + select id_deal, potential_deal_name as main_deal_name + from + ( + select + id_deal, + potential_deal_name, + potential_name_count, + row_number() over ( + partition by id_deal order by potential_name_count desc + ) as rn + from potential_name_per_deal + ) as ranked_names + where rn = 1 ) select uu.id_deal, + unpd.main_deal_name, mbcpd.main_id_billing_country_per_deal, mbcpd.main_billing_country_name_per_deal, mbcpd.main_billing_country_iso_2_per_deal, mbcpd.main_billing_country_iso_3_per_deal, count(distinct uu.id_user) as users_with_this_id_deal, - count( - distinct uu.billing_country_iso_3 - ) as billing_countries_for_this_id_deal -from main_billing_country_per_deal mbcpd -left join int_core__unified_user uu - on uu.id_deal = mbcpd.id_deal + count(distinct uu.billing_country_iso_3) as billing_countries_for_this_id_deal +from main_billing_country_per_deal mbcpd +left join int_core__unified_user uu on uu.id_deal = mbcpd.id_deal +left join unique_name_per_deal unpd on uu.id_deal = unpd.id_deal where mbcpd.id_deal is not null -group by 1, 2, 3, 4, 5 +group by 1, 2, 3, 4, 5, 6 diff --git a/models/intermediate/core/schema.yaml b/models/intermediate/core/schema.yaml index 9ae1b06..d67a186 100644 --- a/models/intermediate/core/schema.yaml +++ b/models/intermediate/core/schema.yaml @@ -2971,7 +2971,9 @@ models: the same time, not all users that act as hosts have a deal associated. One example is Know Your Guest (KYG) Lite accounts. However, there's also historical cases that for whatever reason there's no Deal associated. - + For this model, the billing country and the deal name are estimated based on + the information available in int_core__unified_user. + columns: - name: id_deal data_type: character varying @@ -2979,6 +2981,17 @@ models: tests: - not_null - unique + - name: main_deal_name + data_type: string + description: | + Main name for this ID deal. It's a clean version of + the most repeated name within the user tables in the + fields of first_name, last_name and company name. + This field should be modified at the moment we have + a proper way to retrieve a common account name per deal. + It can contain duplicates. + tests: + - not_null - name: main_id_billing_country_per_deal data_type: integer description: |