PureGym NLP Topic Project β€” Basic NotebookΒΆ

PACE Course 3 (CAM_DS_301) Β· Weeks 4–5

Rubric-aligned, line by line. Each of the 48 rubric items has:

  1. The rubric text, verbatim
  2. Our learnings β€” what we found out while doing this
  3. The code

Runs on Google Colab with the A100 runtime.

SetupΒΆ

1. Install dependenciesΒΆ

Run this once per Colab session. -q keeps the output short.

InΒ [1]:
!pip install -q pandas openpyxl nltk wordcloud matplotlib bertopic langdetect transformers torch gensim pyLDAvis kaleido
  Preparing metadata (setup.py) ... done
  Building wheel for langdetect (setup.py) ... done

2. Upload the two Excel filesΒΆ

Easiest path: click the folder icon in Colab's left sidebar β†’ Upload β†’ pick Google_12_months.xlsx and Trustpilot_12_months.xlsx.

Alternative: mount Google Drive and read from there.

InΒ [2]:
# On Colab, uncomment if you want the upload dialog:
# from google.colab import files
# files.upload()

# Or mount Drive:
# from google.colab import drive; drive.mount('/content/drive')

# After upload, the files live in /content/ β€” the default working dir.
import os
for f in ['Google_12_months.xlsx', 'Trustpilot_12_months.xlsx']:
    print(f, 'found' if os.path.exists(f) else 'MISSING β€” upload it first')
Google_12_months.xlsx found
Trustpilot_12_months.xlsx found

3. Imports and NLTK dataΒΆ

InΒ [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

pd.set_option('display.max_colwidth', 120)
print('Ready.')
Ready.

Importing packages and dataΒΆ

Rubric item 1ΒΆ

Import the data file Google_12_months.xlsx into a dataframe.

Our learnings

  • Columns used downstream: Comment (text), Overall Score (1–5), Club's Name (location).
  • 23,250 rows raw, but only ~14k have text — the rest are star-only reviews with no comment, handled at rubric item 3.
InΒ [4]:
google_df = pd.read_excel('Google_12_months.xlsx')
print(f"Google: {len(google_df):,} rows, {len(google_df.columns)} cols")
google_df.head(3)
Google: 23,250 rows, 7 cols
Out[4]:
Customer Name SurveyID for external use (e.g. tech support) Club's Name Social Media Source Creation Date Comment Overall Score
0 ** ekkt2vyxtkwrrrfyzc5hz6rk Leeds City Centre North Google Reviews 2024-05-09 23:49:18 NaN 4
1 ** e9b62vyxtkwrrrfyzc5hz6rk Cambridge Leisure Park Google Reviews 2024-05-09 22:48:39 Too many students from two local colleges go her leave rubbish in changing rooms and sit there like there in a cante... 1
2 ** e2dkxvyxtkwrrrfyzc5hz6rk London Holborn Google Reviews 2024-05-09 22:08:14 Best range of equipment, cheaper than regular gyms. very professional and friendly staff that makes your gym your se... 5

Rubric item 2ΒΆ

Import the data file Trustpilot_12_months.xlsx into a dataframe.

Our learnings

  • Columns used downstream: Review Content (text), Review Stars (1–5), Location Name, Review Title, Review Language.
  • ~16k rows raw. Trustpilot also has a Title β€” we use Content only (rubric is explicit).
InΒ [5]:
trustpilot_df = pd.read_excel('Trustpilot_12_months.xlsx')

# Data-quality note (Sonnet investigation 2026-04-25, basic/appendix_assets/
# location_investigation.json): 216 rows have numeric Location Name placeholders
# β€” 174 as '345' and 42 as '398'. Both are real PureGym UK reviews (same
# Business Unit ID and Webshop Name as every other row). The Sonnet pass on the
# review text shows each placeholder is a multi-site catch-all bucket rather
# than a single gym: '345' aggregates Wimbledon/Camden/Bermondsey/Greenwich/
# Woolwich/Sidcup/Grimsby/Basildon/Cheshunt; '398' is predominantly Shrewsbury
# but contaminated with Mansfield + Wrexham + Telford. They stay in
# overall sentiment/topic/emotion analysis but are excluded from
# location-specific top-N rankings later in the notebook.
_numeric_mask = trustpilot_df['Location Name'].astype(str).str.match(r'^\s*\d+\s*$', na=False)
print(f"Trustpilot: {len(trustpilot_df):,} rows, {len(trustpilot_df.columns)} cols")
print(f"  (of which {_numeric_mask.sum()} have numeric Location Name placeholders β€” kept)")
trustpilot_df.head(3)
Trustpilot: 16,673 rows, 15 cols
  (of which 216 have numeric Location Name placeholders β€” kept)
Out[5]:
Review ID Review Created (UTC) Review Consumer User ID Review Title Review Content Review Stars Source Of Review Review Language Domain URL Webshop Name Business Unit ID Tags Company Reply Date (UTC) Location Name Location ID
0 663d40378de0a14c26c2f63c 2024-05-09 23:29:00 663d4036d5fa24c223106005 A very good environment A very good environment 5 AFSv2 en http://www.puregym.com PureGym UK 508df4ea00006400051dd7b1 NaN 2024-05-10 08:12:00 Solihull Sears Retail Park 7b03ccad-4a9d-4a33-9377-ea5bba442dfc
1 663d3c101ccfcc36fb28eb8c 2024-05-09 23:11:00 5f5e3434d53200fa6ac57238 I love to be part of this gym I love to be part of this gym. Superb value for money. Any time, any day. Love the app too, well organised building ... 5 AFSv2 en http://www.puregym.com PureGym UK 508df4ea00006400051dd7b1 NaN 2024-05-10 08:13:00 Aylesbury 612d3f7e-18f9-492b-a36f-4a7b86fa5647
2 663d375859621080d08e6198 2024-05-09 22:51:00 57171ba90000ff000a18f905 Extremely busy Extremely busy, no fresh air. 1 AFSv2 en http://www.puregym.com PureGym UK 508df4ea00006400051dd7b1 NaN NaT Sutton Times Square 0b78c808-f671-482b-8687-83468b7b5bc1

Rubric item 3ΒΆ

Remove any rows with missing values in the Comment column (Google review) and Review Content column (Trustpilot).

Our learnings

  • Google loses 9,352 rows — roughly 40% of the raw export is star-only with no comment text.
  • Trustpilot loses nothing here: every row in this export already has Review Content.
  • Do this first. Every downstream step assumes text is present.
InΒ [6]:
before_g, before_t = len(google_df), len(trustpilot_df)
google_df = google_df.dropna(subset=['Comment']).reset_index(drop=True)
trustpilot_df = trustpilot_df.dropna(subset=['Review Content']).reset_index(drop=True)
print(f"Google:     {before_g:,} -> {len(google_df):,} ({before_g - len(google_df):,} dropped)")
print(f"Trustpilot: {before_t:,} -> {len(trustpilot_df):,} ({before_t - len(trustpilot_df):,} dropped)")
Google:     23,250 -> 13,898 (9,352 dropped)
Trustpilot: 16,673 -> 16,673 (0 dropped)

Rubric item 3.1 β€” (our addition) Filter to English-only reviewsΒΆ

The rubric says "Review Language" can be ignored. We go beyond β€” non-English reviews contaminate BERTopic clusters and skew frequency counts. Removing them here gives a cleaner signal for every downstream step.

Our learnings

  • Trustpilot already has a Review Language column β€” free and accurate. Only ~0.5% of Trustpilot reviews are non-English. We just filter that column.
  • Google has no language metadata. We run langdetect on the text (~30–60 seconds for 14k reviews on A100). The V3 analysis found ~13% of negative Google reviews are non-English β€” that's the pollution we're removing.
  • The Trustpilot location list is UK-only (no Fitness World / Copenhagen / Berlin entries) β€” so a language filter is enough. No extra location filter needed.
  • We keep the dropped rows in google_non_en / trustpilot_non_en in case you want to sanity-check or discuss them in the appendix.
InΒ [7]:
from langdetect import detect, LangDetectException, DetectorFactory
DetectorFactory.seed = 0  # deterministic output

def detect_lang(text):
    try:
        return detect(str(text)[:500])  # cap 500 chars for speed
    except LangDetectException:
        return 'unknown'

# --- Google: no language metadata, so detect ---
print('Detecting language for Google reviews (~30-60s on A100)...')
google_df['detected_lang'] = google_df['Comment'].apply(detect_lang)
print('\nGoogle language distribution (top 10):')
print(google_df['detected_lang'].value_counts().head(10))

# --- Trustpilot: use the built-in Review Language column ---
print('\nTrustpilot Review Language column (top 10):')
print(trustpilot_df['Review Language'].value_counts().head(10))

# --- Filter to English-only ---
before_g, before_t = len(google_df), len(trustpilot_df)

google_non_en = google_df[google_df['detected_lang'] != 'en'].copy()
trustpilot_non_en = trustpilot_df[trustpilot_df['Review Language'] != 'en'].copy()

google_df = google_df[google_df['detected_lang'] == 'en'].reset_index(drop=True)
trustpilot_df = trustpilot_df[trustpilot_df['Review Language'] == 'en'].reset_index(drop=True)

print(f'\nGoogle:     {before_g:,} -> {len(google_df):,} '
      f'({len(google_non_en):,} non-English dropped, {len(google_non_en)/before_g*100:.1f}%)')
print(f'Trustpilot: {before_t:,} -> {len(trustpilot_df):,} '
      f'({len(trustpilot_non_en):,} non-English dropped, {len(trustpilot_non_en)/before_t*100:.1f}%)')
Detecting language for Google reviews (~30-60s on A100)...

Google language distribution (top 10):
detected_lang
en    11879
da      449
de      399
cy      321
fr      127
ca       77
af       71
so       62
es       55
no       51
Name: count, dtype: int64

Trustpilot Review Language column (top 10):
Review Language
en    16581
da       34
pl        9
pt        9
es        9
it        6
ro        6
fr        4
de        4
bg        1
Name: count, dtype: int64

Google:     13,898 -> 11,879 (2,019 non-English dropped, 14.5%)
Trustpilot: 16,673 -> 16,581 (92 non-English dropped, 0.6%)

Conducting initial data investigationΒΆ

Rubric item 4ΒΆ

Find the number of unique locations in the Google data set. Find the number of unique locations in the Trustpilot data set. Use Club's Name for the Google data set. Use Location Name for the Trustpilot data set.

Our learnings

  • Google Club's Name is clean and uniform β€” counts are trustworthy.
  • Trustpilot Location Name is free text. Same gym can appear as "PureGym Aberdeen", "PureGym Aberdeen Beach Blvd", "Puregym Aberdeen (AB10)" β€” sorting reveals the duplicates.
  • We print the sorted list so you can eyeball it and (optionally) consolidate with a manual mapping below.
  • Side note — not every Trustpilot review is about a location. Some are about billing, the app, or membership; those add company-level noise on top of the location signal. Not part of the rubric; flagged in the appendix.
InΒ [8]:
print("Google unique Club's Name:", google_df["Club's Name"].nunique())
print("Trustpilot unique Location Name:", trustpilot_df['Location Name'].nunique())

# Sorted list of Trustpilot locations β€” scan for near-duplicates
print("\nTrustpilot locations (sorted β€” watch for PureGym vs Pure Gym, trailing spaces, postcode suffixes):")
for loc in sorted(trustpilot_df['Location Name'].dropna().astype(str).unique()):
    print(f"  {loc}")
Google unique Club's Name: 455
Trustpilot unique Location Name: 376

Trustpilot locations (sorted β€” watch for PureGym vs Pure Gym, trailing spaces, postcode suffixes):
  345
  398
  Aberdeen Kittybrewster
  Aberdeen Rubislaw
  Aberdeen Shiprow
  Aberdeen Wellington Circle
  Aintree
  Aldershot Westgate Retail Park
  Alloa
  Altrincham
  Andover
  Ashford Warren Retail Park
  Ashton-Under-Lyne
  Aylesbury
  Ballymena
  Banbury Cross Retail Park
  Bangor Northern Ireland
  Bangor Wales
  Barnstaple
  Basildon
  Bath Spring Wharf
  Bath Victoria Park
  Bedford Heights
  Belfast Adelaide Street
  Belfast Boucher Road
  Belfast St Anne's Square
  Bicester
  Billericay
  Birmingham Arcadian Centre
  Birmingham Beaufort Park
  Birmingham City Centre
  Birmingham Longbridge
  Birmingham Maypole
  Birmingham Snow Hill Plaza
  Birmingham West
  Blackburn The Mall
  Bletchley
  Blyth
  Borehamwood
  Boston
  Bournemouth Mallard Road
  Bournemouth the Triangle
  Bracknell
  Bradford Idle
  Bradford Thornbury
  Bridgwater
  Brierley Hill
  Brighton Central
  Brighton London Road
  Bristol Abbey Wood Retail Park
  Bristol Brislington
  Bristol Eastgate
  Bristol Harbourside
  Bristol Union Gate
  Broadstairs
  Bromborough
  Bromsgrove Retail Park
  Buckingham
  Burgess Hill
  Burnham
  Bury
  Byfleet
  Caerphilly
  Camberley
  Cambridge Grafton Centre
  Cambridge Leisure Park
  Camden
  Cannock Orbital Retail Park
  Canterbury Riverside
  Canterbury Sturry Road
  Cardiff Bay
  Cardiff Central
  Cardiff Gate
  Cardiff Western Avenue
  Catford Rushey Green
  Chatham
  Chelmsford Meadows
  Cheshunt Brookfield Shopping Park
  Chester
  Chippenham
  Cirencester Retail Park
  Colchester Retail Park
  Coleraine
  Colne
  Consett
  Corby
  Coventry Bishop Street
  Coventry Skydome
  Coventry Warwickshire Shopping Park
  Crayford
  Crewe Grand Junction
  Dagenham
  Denton
  Derby
  Derby Kingsway
  Derry Londonderry
  Didcot
  Doncaster
  Dover
  Dudley Tipton
  Dumfries
  Dundee
  Dunfermline
  Durham Arnison
  East Grinstead
  East Kilbride
  Eastbourne
  Edinburgh Craigleith, ID 317
  Edinburgh Exchange Crescent
  Edinburgh Fort Kinnaird
  Edinburgh Ocean Terminal
  Edinburgh Quartermile
  Edinburgh Waterfront
  Edinburgh West
  Elgin
  Epsom
  Evesham
  Exeter Bishops Court
  Exeter Fore Street
  Falkirk
  Fareham
  Folkestone
  Galashiels
  Gateshead
  Glasgow Bath Street
  Glasgow Charing Cross
  Glasgow Clydebank
  Glasgow Giffnock
  Glasgow Hope Street
  Glasgow Milngavie
  Glasgow Robroyston
  Glasgow Shawlands
  Glasgow Silverburn
  Glossop
  Gloucester Quedgeley
  Gloucester Retail Park
  Grantham Discovery Retail Park
  Gravesend
  Great Yarmouth
  Grimsby
  Halifax
  Harlow
  Harrogate
  Hatfield
  Haverhill
  Heanor
  Hednesford Cannock
  Hemel Hempstead
  Hereford
  Hitchin
  Hull Anlaby
  Inverness Inshes Retail Park
  Ipswich Buttermarket
  Ipswich Ravenswood
  Kirkcaldy
  Knarebsorough
  Leamington Spa
  Leeds Bramley
  Leeds City Centre North
  Leeds City Centre South
  Leeds Hunslet
  Leeds Kirkstall Bridge
  Leeds Regent Street
  Leeds Thorpe Park
  Leicester St Georges Way
  Leicester Walnut Street
  Lichfield
  Lincoln
  Lincoln Carlton Centre
  Linlithgow
  Lisburn Laganbank
  Liverpool Brunswick
  Liverpool Central
  Liverpool Edge Lane
  Livingston
  Llantrisant
  London Acton
  London Aldgate
  London Angel
  London Bank
  London Bayswater
  London Beckton
  London Bermondsey
  London Borough
  London Bow Wharf
  London Bromley
  London Camberwell New Road
  London Camberwell Southampton Way
  London Canary Wharf
  London Charlton
  London Clapham
  London Colindale
  London Crouch End
  London Croydon
  London East India Dock
  London East Sheen
  London Edgware
  London Enfield
  London Farringdon
  London Finchley
  London Finsbury Park
  London Fulham
  London Great Portland Street
  London Greenwich
  London Greenwich Movement
  London Hammersmith Palais
  London Hayes
  London Holborn
  London Holloway Road
  London Hoxton
  London Ilford
  London Kentish Town
  London Kidbrooke Village
  London Kingston
  London Lambeth
  London Lewisham
  London Leytonstone
  London Limehouse
  London Marylebone
  London Muswell Hill
  London North Finchley
  London Orpington Central
  London Oval
  London Park Royal
  London Piccadilly
  London Putney
  London Seven Sisters
  London Shoreditch
  London Southgate
  London St Pauls
  London Stratford
  London Streatham
  London Swiss Cottage
  London Sydenham
  London Tottenham Court Road
  London Tower Hill
  London Twickenham
  London Wall
  London Wandsworth
  London Waterloo
  London Wembley
  London Whitechapel
  Loughborough
  Luton and Dunstable
  Macclesfield Silk Road
  Maidenhead
  Maidstone The Mall
  Maldon Blackwater Retail Park
  Manchester Bury New Road
  Manchester Cheetham Hill
  Manchester Debdale
  Manchester Eccles
  Manchester Exchange Quay
  Manchester First Street
  Manchester Market Street
  Manchester Moston
  Manchester Spinningfields
  Manchester Stretford
  Manchester Urban Exchange
  Mansfield
  Merthyr Tydfil
  Milton Keynes Kingston Centre
  Milton Keynes Winterhill
  Motherwell
  New Barnet
  Newbury
  Newcastle Eldon Garden
  Newcastle Longbenton
  Newcastle St James
  Newport Gwent
  Newry
  Newtownabbey
  Northallerton
  Northampton Central
  Northampton Weston Favell
  Northolt
  Northwich
  Norwich Aylsham Road
  Norwich Castle Mall
  Norwich Riverside
  Nottingham Basford
  Nottingham Beeston
  Nottingham Castle Marina
  Nottingham Colwick
  Nottingham West Bridgford
  Nuneaton
  Oldham
  Ormskirk
  Oxford Central
  Oxford Templars Shopping Park
  Paisley
  Palmers Green
  Peterborough Brotherhood Retail Park
  Peterborough Serpentine Green
  Plymouth Alexandra Road
  Plymouth Marsh Mills
  Poole
  Port Talbot
  Portishead
  Portsmouth Commercial Road
  Portsmouth North Harbour
  Preston
  Purley
  Rayleigh
  Reading Basingstoke Road
  Reading Calcot
  Reading Caversham Road
  Redditch
  Redditch Ringway
  Rochdale
  Romford
  Runcorn
  Rushden
  Saffron Walden
  Salford
  Salisbury
  Sevenoaks
  Sheffield City Centre South
  Sheffield Crystal Peaks
  Sheffield Meadowhall
  Sheffield Millhouses
  Solihull Sears Retail Park
  South Ruislip
  Southampton Bitterne
  Southampton Central
  Southampton Shirley
  Southend Fossetts Park
  Southport
  St Albans
  St Ives
  Stafford
  Staines
  Stevenage
  Stirling
  Stockport North
  Stockport South
  Stoke on Trent North
  Stoke-on-Trent East
  Stowmarket
  Stratford upon Avon
  Sunderland
  Sutton Coldfield
  Sutton Times Square
  Swindon Mannington Retail Park
  Swindon Stratton
  Taunton Riverside
  Telford
  Tonbridge
  Torquay Bridge Retail Park
  Trowbridge
  Tunbridge Wells
  Tyldesley
  Uttoxeter
  Wakefield
  Walsall
  Walsall Crown Wharf
  Walton-on-Thames
  Warrington Central
  Warrington North
  Waterlooville
  Watford Waterfields
  West Bromwich
  West Thurrock
  Weston-super-Mare
  Widnes
  Wirral Bidston Moss
  Wisbech
  Witney
  Woking
  Wolverhampton Bentley Bridge
  Wolverhampton South
  Worcester
  Wrexham
  Yate
  Yeovil Houndstone Retail Park
  York

Rubric item 4.1ΒΆ

(Optional, our addition) Manual consolidation of Trustpilot location names.

Our learnings

  • Paste near-duplicate groups into manual_map below to collapse them.
  • Leave this cell empty to skip β€” the count in item 4 is still rubric-compliant.
InΒ [9]:
# Optional: paste in near-duplicate mappings you spot in the sorted list above.
# Example: 'Pure Gym Aberdeen': 'PureGym Aberdeen'
manual_map = {
    # 'Pure Gym Aberdeen': 'PureGym Aberdeen',
    # 'PureGym Aberdeen Beach Blvd': 'PureGym Aberdeen',
}

if manual_map:
    trustpilot_df['Location Name'] = trustpilot_df['Location Name'].replace(manual_map)
    print(f"After manual consolidation: {trustpilot_df['Location Name'].nunique()} unique locations")
else:
    print("No manual mappings applied β€” skipping.")
No manual mappings applied β€” skipping.

Rubric item 5ΒΆ

Find the number of common locations between the Google data set and the Trustpilot data set.

Our learnings

  • Naive set intersection undercounts β€” capitalisation, spacing, "Pure Gym" vs "PureGym" all mismatch.
  • We show both: naive match (rubric-strict) and normalised match (what the data actually supports).
InΒ [10]:
# Build common-locations sets from both platforms.
#
# Scope note: PureGym operates internationally (UK + Switzerland + Denmark
# per the Google export β€” sites like 'BachenbΓΌlach', 'Roskilde', 'Adliswil',
# 'Oftringen' are real Swiss/Danish PureGym branches). The English-only
# language filter applied earlier removes most of those reviews, so the
# analysis is effectively scoped to UK operations — though English-language
# reviews of non-UK sites can survive the filter (see 'Elkridge' and
# 'Tysons Corner' in the item 21 table). The international locations
# stay as a methodology footnote and are not in the top-N rankings below.
g_locs = set(google_df["Club's Name"].dropna().astype(str).unique())
t_locs = set(trustpilot_df['Location Name'].dropna().astype(str).unique())

print(f"Naive intersection:       {len(g_locs & t_locs)}")

def norm(s):
    s = str(s).lower().strip()
    for prefix in ('puregym ', 'pure gym ', 'pg '):
        if s.startswith(prefix):
            s = s[len(prefix):]
    return s.strip()

g_norm = {norm(x): x for x in g_locs}
t_norm = {norm(x): x for x in t_locs}
common_keys = set(g_norm) & set(t_norm)
print(f"Normalised intersection:  {len(common_keys)}")

# Hand-curated cross-platform merges (rapidfuzz token_set_ratio scan
# 2026-04-25, all >=90 confidence + Pierre review). Each entry maps a
# Trustpilot Location Name -> the canonical Google Club's Name. Most are
# 'Retail Park' / 'Mall' suffix variance; one is the 'Knarebsorough' typo.
MANUAL_MERGES = {
    'Aberdeen Wellington Circle':         'Aberdeen Wellington',
    'Aldershot Westgate Retail Park':     'Aldershot - Westgate',
    'Ashford Warren Retail Park':         'Ashford',
    'Banbury Cross Retail Park':          'Banbury Cross',
    'Birmingham Snow Hill Plaza':         'Birmingham Snow Hill',
    'Broadstairs':                        'Broadstairs Westwood Gateway Retail Park',
    'Catford Rushey Green':               'London Catford',
    'Chelmsford Meadows':                 'Chelmsford - The Meadows',
    'Cirencester Retail Park':            'Cirencester',
    'Crewe Grand Junction':               'Crewe Grand Junction Retail Park',
    'Grantham Discovery Retail Park':     'Grantham',
    'Haverhill':                          'Haverhill Retail Park',
    'Inverness Inshes Retail Park':       'Inverness Inshes',
    'Knarebsorough':                      'Knaresborough',  # typo fix
    'London Shoreditch':                  'London Shoreditch High Street',
    'Macclesfield Silk Road':             'Macclesfield',
    'Maldon Blackwater Retail Park':      'Maldon',
    'Peterborough Serpentine Green':      'Peterborough Serpentine',
    'Solihull Sears Retail Park':         'Solihull',
    'St Ives':                            'St Ives Cambridgeshire',
    'Taunton Riverside':                  'Taunton',
    'Torquay Bridge Retail Park':         'Torquay',
    'Yeovil Houndstone Retail Park':      'Yeovil Houndstone',
}
# Apply the merges to extend the common-locations set.
for tp_name, g_name in MANUAL_MERGES.items():
    if g_name in g_locs and tp_name in t_locs:
        common_keys.add(norm(g_name))
        g_norm.setdefault(norm(g_name), g_name)
        t_norm[norm(g_name)] = tp_name  # tag the Trustpilot side under the canonical key
print(f"After manual merges:      {len(common_keys)}")

common_google = {g_norm[k] for k in common_keys}
common_trustpilot = {t_norm[k] for k in common_keys}
Naive intersection:       310
Normalised intersection:  312
After manual merges:      335
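
For reference, here is a minimal sketch of the kind of rapidfuzz scan that produced MANUAL_MERGES — an assumed reconstruction, not the original script. rapidfuzz isn't in the step-1 install (run pip install -q rapidfuzz first), and the >=90 threshold simply mirrors the code comment above.

# Sketch: brute-force fuzzy pass over the normalised location names.
# Review the candidates by hand before promoting any into MANUAL_MERGES.
from rapidfuzz import fuzz

candidates = []
for t_name in sorted(t_locs):
    for g_name in sorted(g_locs):
        score = fuzz.token_set_ratio(norm(t_name), norm(g_name))
        if score >= 90 and norm(t_name) != norm(g_name):
            candidates.append((score, t_name, g_name))

for score, t_name, g_name in sorted(candidates, reverse=True)[:25]:
    print(f"{score:5.1f}  {t_name:40} -> {g_name}")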

Rubric item 6ΒΆ

Perform preprocessing of the data β€” change to lower case, remove stopwords using NLTK, and remove numbers.

Our learnings

  • Stopword list extended beyond NLTK default: pure, gym, puregym, puregyms. These dominate the word cloud otherwise β€” you learn nothing about what the reviews are actually saying.
  • Iterative: review the wordcloud, add more stopwords if dominant non-signal words show up, re-run (cohort advice, 2026-04-16 thread).
  • Output stored in a clean column. Important: this cleaned text is for word-frequency and wordclouds. It is NOT used for BERTopic or emotion classification β€” those models want original sentences.
InΒ [11]:
stop_words = set(stopwords.words('english'))

# Brand stops
stop_words |= {'pure', 'gym', 'puregym', 'puregyms'}

# Generic English filler that NLTK's 'english' list misses — surfaced by the negative-review top-15
GENERIC_STOPS = {
    # generic verbs + inflections
    'get', 'got', 'getting', 'gotten',
    'go', 'going', 'gone', 'went', 'goes',
    'take', 'took', 'taken', 'taking', 'takes',
    'see', 'seen', 'saw', 'seeing',
    'come', 'came', 'coming', 'comes',
    'make', 'made', 'making', 'makes',
    'know', 'knew', 'known', 'knowing', 'knows',
    'think', 'thought', 'thinking', 'thinks',
    'want', 'wanted', 'wanting',
    'use', 'used', 'using', 'uses',
    'say', 'said', 'says', 'saying',
    'give', 'gave', 'given', 'giving',
    'find', 'found', 'finding',
    'look', 'looked', 'looking', 'looks',
    'tell', 'told', 'telling',
    # modals (some may overlap NLTK, harmless)
    'would', 'could', 'should', 'might', 'must', 'may',
    # generic intensifiers / adjectives
    'good', 'better', 'best', 'bad', 'worse', 'worst',
    'nice', 'great', 'big', 'small',
    'much', 'many', 'lot', 'lots', 'plenty',
    'like', 'unlike',
    'also', 'even', 'just', 'really', 'still', 'though',
    'always', 'never', 'often', 'sometimes', 'usually',
    'almost',
    # generic nouns / time
    'time', 'times',
    'day', 'days', 'week', 'weeks', 'month', 'months', 'year', 'years',
    'way', 'ways',
    'thing', 'things',
    'people', 'person',
    'one', 'ones', 'two', 'three',
    'etc',
}
stop_words |= GENERIC_STOPS

def preprocess(text):
    text = str(text).lower()
    text = ''.join(c for c in text if not c.isdigit())
    tokens = [w for w in text.split() if w.isalpha() and w not in stop_words]
    return ' '.join(tokens)

google_df['clean'] = google_df['Comment'].apply(preprocess)
trustpilot_df['clean'] = trustpilot_df['Review Content'].apply(preprocess)

print('Example:')
print(' raw  :', google_df['Comment'].iloc[0][:120])
print(' clean:', google_df['clean'].iloc[0][:120])
Example:
 raw  : Too many students from two local colleges go her leave rubbish in changing rooms and sit there like there in a canteen. 
 clean: students local colleges leave rubbish changing rooms sit cancel membership disgusting students hanging around machines m

Rubric item 7ΒΆ

Tokenise the data using word_tokenize from NLTK.

Our learnings

  • We tokenise the clean text. Adds a tokens column (list of strings).
  • word_tokenize handles punctuation better than naive split() β€” matters for item 8 frequency counts.
InΒ [12]:
google_df['tokens'] = google_df['clean'].apply(word_tokenize)
trustpilot_df['tokens'] = trustpilot_df['clean'].apply(word_tokenize)
print("First Google token list:", google_df['tokens'].iloc[0][:15])
First Google token list: ['students', 'local', 'colleges', 'leave', 'rubbish', 'changing', 'rooms', 'sit', 'cancel', 'membership', 'disgusting', 'students', 'hanging', 'around', 'machines']
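
A quick illustration of why this matters, on a made-up review (hypothetical sentence, not from the data): split() leaves punctuation glued to words, word_tokenize peels it off.

s = "Great gym, can't fault the 24/7 access!"
print('split()       :', s.split())
print('word_tokenize :', word_tokenize(s))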

Rubric item 8ΒΆ

Find the frequency distribution of the words from each data set's reviews separately (use nltk.FreqDist).

Our learnings

  • Top words confirm the stopword list is working β€” no pure, gym, puregym in the top 20.
  • Google and Trustpilot have overlapping but distinct vocabulary β€” Trustpilot skews more "billing/membership", Google more "equipment/cleanliness". That's why common-location BERTopic later is interesting.
InΒ [13]:
google_words = [w for toks in google_df['tokens'] for w in toks]
trustpilot_words = [w for toks in trustpilot_df['tokens'] for w in toks]

google_fd = FreqDist(google_words)
trustpilot_fd = FreqDist(trustpilot_words)

print("Google top 20:    ", google_fd.most_common(20))
print("\nTrustpilot top 20:", trustpilot_fd.most_common(20))
Google top 20:     [('equipment', 2435), ('staff', 2119), ('classes', 1715), ('friendly', 1358), ('clean', 1272), ('machines', 1241), ('class', 1048), ('place', 993), ('busy', 901), ('well', 836), ('love', 820), ('need', 767), ('work', 752), ('changing', 675), ('weights', 658), ('workout', 607), ('free', 561), ('new', 560), ('recommend', 557), ('around', 554)]

Trustpilot top 20: [('equipment', 3179), ('staff', 2829), ('friendly', 2077), ('easy', 2019), ('clean', 1792), ('classes', 1758), ('machines', 1368), ('well', 1071), ('membership', 927), ('need', 915), ('class', 870), ('helpful', 857), ('work', 852), ('changing', 731), ('feel', 728), ('place', 723), ('love', 720), ('first', 691), ('new', 649), ('joining', 642)]
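
To make the "overlapping but distinct" claim concrete, a quick set difference over each platform's top 50 (uses the two FreqDists above):

g_top50 = {w for w, _ in google_fd.most_common(50)}
t_top50 = {w for w, _ in trustpilot_fd.most_common(50)}
print('In Google top-50 only:    ', sorted(g_top50 - t_top50))
print('In Trustpilot top-50 only:', sorted(t_top50 - g_top50))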

Rubric item 9ΒΆ

Plot a histogram/bar plot showing the top 10 words from each data set.

Our learnings

  • Side-by-side so differences are visible at a glance.
InΒ [14]:
fig, axes = plt.subplots(1, 2, figsize=(14, 4.5))
for ax, fd, title, color in [
    (axes[0], google_fd, 'Google', '#4285F4'),
    (axes[1], trustpilot_fd, 'Trustpilot', '#00B67A'),
]:
    words, counts = zip(*fd.most_common(10))
    bars = ax.bar(words, counts, color=color, edgecolor='white', linewidth=0.5)
    ax.set_title(f'{title} β€” top 10 words', fontsize=13, fontweight='bold', pad=10)
    ax.set_ylabel('Frequency')
    ax.tick_params(axis='x', rotation=35)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(axis='y', alpha=0.25, linestyle='--')
    for bar, c in zip(bars, counts):
        ax.text(bar.get_x() + bar.get_width() / 2, c + max(counts) * 0.01,
                f'{c:,}', ha='center', fontsize=9, color='#444')
plt.tight_layout(); plt.show()
[Figure: side-by-side bar charts — Google and Trustpilot top-10 words]

Rubric item 10ΒΆ

Use the wordcloud library on the cleaned data and plot the word cloud.

Our learnings

  • Two clouds, same scale. The visual gap between Google (equipment/staff/cleanliness) and Trustpilot (payment/cancel/membership) is the story.
InΒ [15]:
from wordcloud import WordCloud

google_blue_cmap, trust_green_cmap = 'Blues', 'Greens'
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, df, title, cmap in [
    (axes[0], google_df, 'Google', google_blue_cmap),
    (axes[1], trustpilot_df, 'Trustpilot', trust_green_cmap),
]:
    text = ' '.join(df['clean'].astype(str))
    wc = WordCloud(width=900, height=500, background_color='white', colormap=cmap,
                   max_words=120, collocations=False).generate(text)
    ax.imshow(wc, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'{title}: all reviews', fontsize=14, fontweight='bold', pad=8)

plt.tight_layout()
# Hero figure for the report β€” saved to the Colab working dir.
# After Run All, download from the left sidebar to commit alongside the .ipynb.
plt.savefig('hero_wordcloud.png', dpi=150, bbox_inches='tight', facecolor='white')
plt.show()
[Figure: word clouds — all Google reviews (Blues) and all Trustpilot reviews (Greens)]
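
Prefer not to hunt through the sidebar? Colab can also push the saved PNG straight to your browser (Colab-only, so left commented — same convention as the upload cell in step 2):

# from google.colab import files
# files.download('hero_wordcloud.png')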

Rubric item 11ΒΆ

Create a new dataframe by filtering out the data to extract only the negative reviews from both data sets.

  • For Google reviews, Overall Score < 3 counts as negative.
  • For Trustpilot reviews, Review Stars < 3 counts as negative.

Repeat the frequency distribution and wordcloud steps on the filtered data consisting of only negative reviews.

Our learnings

  • Negative subset is small (~2.4k Google, ~3.5k Trustpilot) — that's fine, it's where the signal is.
  • Expect "staff", "equipment", "cancel", "billing" to dominate here. Positive reviews tend to be shorter and generic ("great gym") — the length check after the figure below bears this out.
InΒ [16]:
google_neg = google_df[google_df['Overall Score'] < 3].reset_index(drop=True)
trustpilot_neg = trustpilot_df[trustpilot_df['Review Stars'] < 3].reset_index(drop=True)
print(f"Google negatives:     {len(google_neg):,}")
print(f"Trustpilot negatives: {len(trustpilot_neg):,}")

# Frequency + wordcloud, negatives only
gn_fd = FreqDist([w for toks in google_neg['tokens'] for w in toks])
tn_fd = FreqDist([w for toks in trustpilot_neg['tokens'] for w in toks])
print("\nGoogle neg top 15:    ", gn_fd.most_common(15))
print("Trustpilot neg top 15:", tn_fd.most_common(15))

fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for i, (df, fd, title, color, cmap) in enumerate([
    (google_neg, gn_fd, 'Google neg', '#4285F4', 'Blues'),
    (trustpilot_neg, tn_fd, 'Trustpilot neg', '#00B67A', 'Greens'),
]):
    # bar chart
    words, counts = zip(*fd.most_common(10))
    bars = axes[i][0].bar(words, counts, color=color, edgecolor='white', linewidth=0.5)
    axes[i][0].set_title(f'{title} β€” top 10 words', fontsize=12, fontweight='bold', pad=8)
    axes[i][0].tick_params(axis='x', rotation=35)
    axes[i][0].spines['top'].set_visible(False)
    axes[i][0].spines['right'].set_visible(False)
    axes[i][0].grid(axis='y', alpha=0.25, linestyle='--')
    for bar, c in zip(bars, counts):
        axes[i][0].text(bar.get_x() + bar.get_width() / 2, c + max(counts) * 0.01,
                        f'{c:,}', ha='center', fontsize=9, color='#444')
    # wordcloud
    wc = WordCloud(width=600, height=300, background_color='white', colormap=cmap,
                   max_words=80, collocations=False).generate(' '.join(df['clean']))
    axes[i][1].imshow(wc, interpolation='bilinear')
    axes[i][1].axis('off')
    axes[i][1].set_title(f'{title} β€” wordcloud', fontsize=12, fontweight='bold', pad=8)
plt.tight_layout(); plt.show()
Google negatives:     2,423
Trustpilot negatives: 3,508

Google neg top 15:     [('equipment', 657), ('staff', 629), ('machines', 431), ('changing', 280), ('place', 276), ('membership', 250), ('weights', 243), ('work', 234), ('around', 226), ('need', 208), ('air', 205), ('broken', 204), ('gyms', 196), ('members', 192), ('enough', 190)]
Trustpilot neg top 15: [('equipment', 558), ('membership', 556), ('staff', 535), ('machines', 373), ('email', 313), ('work', 312), ('member', 310), ('changing', 287), ('pay', 273), ('classes', 272), ('members', 256), ('pin', 247), ('customer', 246), ('need', 241), ('code', 241)]
[Figure: negative-only top-10 bar charts and word clouds — Google and Trustpilot]
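
The "positive reviews are shorter" claim is cheap to verify — a quick length check on the Google side (the same pattern works for Trustpilot with Review Content / Review Stars):

g_len = google_df['Comment'].astype(str).str.split().str.len()
print(f"Mean words per review — negative (<3): {g_len[google_df['Overall Score'] < 3].mean():.1f}, "
      f"positive (>3): {g_len[google_df['Overall Score'] > 3].mean():.1f}")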

Conducting initial topic modellingΒΆ

Rubric item 12ΒΆ

With the data frame created in the previous step:

  • Filter out the reviews that are from the locations common to both data sets.
  • Merge the reviews to form a new list.

Our learnings

  • "Merge the reviews to form a new list" = concatenate the two lists of review texts from common locations β€” one big list of strings for BERTopic to chew on.
InΒ [17]:
g_common = google_neg[google_neg["Club's Name"].isin(common_google)]
t_common = trustpilot_neg[trustpilot_neg['Location Name'].isin(common_trustpilot)]

# Merge the review texts (raw, not the cleaned tokens β€” BERTopic needs sentences)
reviews_common = (g_common['Comment'].astype(str).tolist()
                  + t_common['Review Content'].astype(str).tolist())
print(f"Google negatives at common locations:     {len(g_common):,}")
print(f"Trustpilot negatives at common locations: {len(t_common):,}")
print(f"Combined list of reviews:                 {len(reviews_common):,}")
Google negatives at common locations:     2,163
Trustpilot negatives at common locations: 1,974
Combined list of reviews:                 4,137

Rubric item 13ΒΆ

Preprocess this data set. Use BERTopic on this cleaned data set.

Our learnings

  • We pass raw review text to BERTopic. Do NOT lowercase, remove stopwords, or strip numbers beforehand β€” BERTopic uses a sentence transformer whose embeddings depend on real sentences (capitalisation, stopwords, and punctuation all carry signal).
  • Stopword filtering is applied only to the topic labels, via CountVectorizer(stop_words=...) β€” this keeps "pure" and "gym" out of the topic names without damaging the clustering.
  • min_topic_size is bumped to 20 so we don't get dozens of tiny, noisy topics.
InΒ [18]:
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

custom_stops = list(stopwords.words('english')) + ['pure', 'gym', 'puregym', 'puregyms']
vectorizer = CountVectorizer(stop_words=custom_stops, min_df=2, ngram_range=(1, 2))


def make_umap():
    """Fresh seeded UMAP β€” BERTopic needs one instance per fit_transform call.

    Seed promoted from feedback_bertopic_seed_umap.md (2026-04-18): without
    seeding, topic indices shuffle between runs and the themes dict drifts.
    Parameters mirror BERTopic's defaults."""
    return UMAP(
        n_neighbors=15, n_components=5, min_dist=0.0,
        metric='cosine', random_state=42,
    )


topic_model = BERTopic(vectorizer_model=vectorizer, umap_model=make_umap(),
                       min_topic_size=20, verbose=False)
topics, probs = topic_model.fit_transform(reviews_common)
print(f"Topics found: {topic_model.get_topic_info().shape[0]} (incl. -1 outlier bucket)")
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Topics found: 31 (incl. -1 outlier bucket)

Rubric item 14ΒΆ

Output: list out the top topics along with their document frequencies.

Our learnings

  • -1 is BERTopic's outlier bucket β€” reviews it couldn't confidently assign. A big -1 is normal for short, noisy reviews.
InΒ [19]:
topic_info = topic_model.get_topic_info()
topic_info.head(15)
Out[19]:
Topic Count Name Representation Representative_Docs
0 -1 1471 -1_equipment_people_machines_staff [equipment, people, machines, staff, one, time, dont, like, use, place] [This place has gone down hill. Maybe a change in management is needed.\n\nThe gym is packed solid between 4pm-8pm a...
1 0 550 0_membership_pass_pin_day [membership, pass, pin, day, code, get, access, day pass, email, didnt] [I thought I could just turn up and ask to pay for a day pass at reception. There's no reception area..... scanned a...
2 1 211 1_air_hot_air conditioning_conditioning [air, hot, air conditioning, conditioning, air con, con, ac, aircon, temperature, summer] [Hednesford pure gym is like a sauna, the air conditioning hasn't been working since around May. I have put plenty o...
3 2 167 2_cleaning_dirty_clean_equipment [cleaning, dirty, clean, equipment, stations, toilets, wipe, cleaning stations, machines, disgusting] [This gym leaves a lot to be desired. I cancelled my membership here and joined a different 24 hour one ten minutes ...
4 3 146 3_toilets_toilet_changing_dirty [toilets, toilet, changing, dirty, soap, smell, always, changing rooms, rooms, cleaning] [Stop the cleans from sleeping in male toilets. Or sitting down hiding in the toilet on their phones. Having seen it...
5 4 137 4_class_classes_booked_instructor [class, classes, booked, instructor, instructors, cancelled, time, spin, get, good] [Not impressed with the classes or instructors taking the class, The gym has down hill but increased the fees , it s...
6 5 127 5_parking_car_park_free parking [parking, car, park, free parking, free, fine, parking fine, fines, car park, ticket] [Such a shame to have to write the review because I’ve always liked this gym. Was going before covid and never had a...
7 6 107 6_price_equipment_gyms_one [price, equipment, gyms, one, also, month, would, machines, much, lot] [I have been a member of a few Pure Gyms in Edinburgh since 2012, so was looking forward to the gym opening in Linli...
8 7 105 7_closed_open_247_hours [closed, open, 247, hours, christmas, opening, day, days, 6am, 365] [Turned up at my 24vgour unstaffed gym to find it is closed, I was inbrhe gym yesterday no notice no warning just cl...
9 8 87 8_showers_cold_shower_water [showers, cold, shower, water, temperature, hot, changing, cold showers, rooms, warm] [When I first joined PureGym the showers were nice and hot but the last few months they have been very cold, I asked...
10 9 86 9_manager_rude_member_staff [manager, rude, member, staff, aggressive, us, voice, trainer, like, personal] [Avoid this gym if you want to exercise in a friendly and clean space. The gym manager named DARIA UNIATOWSKA is ext...
11 10 77 10_equipment_broken_machines_missing [equipment, broken, machines, missing, enough, equipment needs, equipments, lot equipment, poor, enough equipment] [A running machine broken for weeks. Machines either side of it don't work despite as advised by staff holding Go bu...
12 11 77 11_equipment_good_weights_small [equipment, good, weights, small, machines, better, space, people, free, enough] [I'll start with the good points:\n\nThe location of the gym is great.\nThe trainers there are all really friendly a...
13 12 73 12_music_loud_noise_hear [music, loud, noise, hear, volume, headphones, classes, cant hear, cant, music loud] [Gym is fine but when a class is on they put the music so loud you can’t hear your own music. I’ve walked out the gy...
14 13 68 13_machines_fix_broken_machine [machines, fix, broken, machine, leg, order, rowing, months, rowing machines, dont] [Things are getting worse since I left my last review. Hand dryer in men's changing rooms - it has been out of use f...
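
How big is the outlier bucket relative to the corpus? A one-line check on the topics list from item 13:

outlier_share = sum(t == -1 for t in topics) / len(topics)
print(f"Outlier bucket: {outlier_share:.1%} of {len(topics):,} reviews")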

Rubric item 15ΒΆ

For the top 2 topics, list out the top words.

Our learnings

  • Skip topic -1 (outliers). The top 2 are the two largest non-outlier clusters.
InΒ [20]:
top2 = [t for t in topic_info['Topic'] if t != -1][:2]
for t in top2:
    words = topic_model.get_topic(t)
    print(f"Topic {t}: {[w for w, _ in words]}")
Topic 0: ['membership', 'pass', 'pin', 'day', 'code', 'get', 'access', 'day pass', 'email', 'didnt']
Topic 1: ['air', 'hot', 'air conditioning', 'conditioning', 'air con', 'con', 'ac', 'aircon', 'temperature', 'summer']

Rubric item 16ΒΆ

Show an interactive visualisation of the topics to identify the cluster of topics and to understand the intertopic distance map.

Our learnings

  • Produces a UMAP projection of topic centroids. Circle size = topic document count. Distance = topic similarity.
  • Renders in-cell in Colab; click a bubble to see the topic's top words.
  • Heads-up: kaleido is in the step-1 install, yet the PNG fallback below still fails — on Colab, plotly often only picks up a freshly installed kaleido after a runtime restart. The interactive figure renders either way.
InΒ [21]:
# Plotly figure with PNG fallback for nbviewer / non-widget Jupyter renderers.
fig = topic_model.visualize_topics()
try:
    fig.write_image('topics_full.png', width=1200, height=800, scale=2)
    from IPython.display import Image, display
    display(Image('topics_full.png'))
except Exception as exc:
    print(f"PNG export failed (likely missing kaleido): {exc}")
fig
PNG export failed (likely missing kaleido): 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Rubric item 17ΒΆ

Show a barchart of the topics, displaying the top 5 words in each topic.

Our learnings

  • One mini bar chart per topic. Useful for deciding labels.
InΒ [22]:
# Plotly figure with PNG fallback for nbviewer / non-widget Jupyter renderers.
fig = topic_model.visualize_barchart(top_n_topics=10, n_words=5)
try:
    fig.write_image('topics_barchart_full.png', width=1200, height=800, scale=2)
    from IPython.display import Image, display
    display(Image('topics_barchart_full.png'))
except Exception as exc:
    print(f"PNG export failed (likely missing kaleido): {exc}")
fig
PNG export failed (likely missing kaleido): 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Rubric item 18ΒΆ

Plot a heatmap, showcasing the similarity matrix.

Our learnings

  • What the colours mean. Each topic is represented by a vector (an average embedding β€” think of it as an arrow in high-dimensional space). Cosine similarity measures the angle between two arrows: 1.0 = identical direction (same meaning), 0 = perpendicular (unrelated), negative = opposite.
  • The heatmap is a grid of cosine similarities between every pair of topics. Bright = similar topics, dark = distinct. The diagonal is always 1.0 (each topic vs itself).
  • Useful for spotting topics that should probably be merged (e.g. two topics both about "staff rudeness" that BERTopic kept apart).
InΒ [23]:
topic_model.visualize_heatmap()
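
To demystify one heatmap cell, here is a sketch that computes the cosine similarity between two topic vectors by hand. It assumes topic_embeddings_ rows follow sorted topic ids with the -1 outlier bucket first, so rows 1 and 2 correspond to topics 0 and 1:

# Manual cosine similarity — what each heatmap cell computes.
emb = np.asarray(topic_model.topic_embeddings_)
a, b = emb[1], emb[2]  # topics 0 and 1 (row 0 is the -1 outlier bucket)
cos = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine(topic 0, topic 1) = {cos:.3f}")

If two rows stay bright across reruns (e.g. the two toilets/changing-room topics that surface in item 19), topic_model.merge_topics(reviews_common, [2, 3]) collapses them in place — check the indices against your own heatmap first.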

Rubric item 19ΒΆ

For 10 clusters, provide a brief description in the Notebook of the topics they comprise of along with the general theme of the cluster, evidenced by the top words within each cluster's topics.

Our learnings

  • We list the top 10 non-outlier topics with their top 7 words and assign a human-readable theme label.
  • The label is our interpretation based on the words β€” double-check it by reading 2–3 representative reviews per topic.
InΒ [24]:
from collections import OrderedDict

top10 = [t for t in topic_info['Topic'] if t != -1][:10]
for t in top10:
    words = [w for w, _ in topic_model.get_topic(t)[:7]]
    n_docs = int(topic_info.loc[topic_info['Topic'] == t, 'Count'].iloc[0])
    sample = topic_model.get_representative_docs(t)[:2]
    print(f"Topic {t}  ({n_docs} reviews)")
    print(f"  Top words: {words}")
    print(f"  Representative: {sample[0][:140] if sample else '(none)'}")
    print()

# Keyword-driven theme labelling β€” robust to UMAP-induced topic-index shuffles.
# Each rule examines the top-7 keywords for that topic and maps to a human-readable
# theme. Rules are ordered most-specific first; fall-through is auto-labelled by
# top-3 keywords.
_THEME_RULES = [
    (('shower', 'water', 'cold', 'hot'), "Cold showers / no hot water"),
    (('pin', 'app', 'code', 'access', 'qr'), "Membership access (PIN/QR codes, app)"),
    (('air', 'conditioning', 'ventilation', 'aircon', 'sweaty'), "Air conditioning / ventilation"),
    (('locker', 'theft', 'stolen', 'broken'), "Locker security & theft"),
    (('toilet', 'changing', 'bathroom', 'room'), "Toilets & changing rooms"),
    (('clean', 'dirty', 'filthy', 'hygiene'), "Cleanliness (stations, equipment)"),
    (('class', 'instructor', 'booking', 'cancelled'), "Classes & instructors"),
    (('parking', 'fine', 'ticket', 'car', 'park'), "Parking (fines, unclear rules)"),
    (('staff', 'manager', 'attitude', 'rude', 'behaviour'), "Staff conduct & management"),
    (('equipment', 'weights', 'machine', 'broken', 'dumbbell'), "Equipment availability & maintenance"),
    (('membership', 'cancel', 'fee', 'refund', 'billing'), "Membership / billing / cancellation"),
]

def _label_topic(top_words: list[str]) -> str:
    lower = [w.lower() for w in top_words]
    for keys, label in _THEME_RULES:
        if any(k in w for k in keys for w in lower):
            return label
    return f"Other: {', '.join(top_words[:3])}"

themes = OrderedDict()
for t in top10:
    top_words = [w for w, _ in topic_model.get_topic(t)[:7]]
    themes[t] = _label_topic(top_words)

for t, theme in themes.items():
    print(f"Topic {t}: {theme}")

# ---- Topic x word c-TF-IDF heatmap (visual companion to the themes dict) ----
import numpy as np
import seaborn as sns

substantive_topics = [t for t in topic_info['Topic'].tolist() if t != -1][:10]
seen = []
for t in substantive_topics:
    for w, _ in topic_model.get_topic(t)[:5]:
        if w not in seen:
            seen.append(w)
        if len(seen) >= 14:
            break
    if len(seen) >= 14:
        break

heatmap_words = seen[:14]
weights = np.zeros((len(substantive_topics), len(heatmap_words)))
for i, t in enumerate(substantive_topics):
    topic_dict = dict(topic_model.get_topic(t))
    for j, w in enumerate(heatmap_words):
        weights[i, j] = topic_dict.get(w, 0.0)

row_labels = [f"{t}: {themes.get(t, '?')[:38]}" for t in substantive_topics]
fig, ax = plt.subplots(figsize=(14, 5.5))
sns.heatmap(weights, xticklabels=heatmap_words, yticklabels=row_labels,
            cmap='YlOrRd', linewidths=0.4, ax=ax, cbar_kws={'label': 'c-TF-IDF weight'})
ax.set_title('Top-10 topics Γ— top discriminative words (BERTopic c-TF-IDF)',
             fontsize=13, fontweight='bold', pad=10)
ax.set_xlabel('Discriminative word')
ax.set_ylabel('Topic theme')
plt.xticks(rotation=40, ha='right')
plt.tight_layout()
plt.show()
Topic 0  (550 reviews)
  Top words: ['membership', 'pass', 'pin', 'day', 'code', 'get', 'access']
  Representative: I thought I could just turn up and ask to pay for a day pass at reception. There's no reception area..... scanned a QR code on a poster abou

Topic 1  (211 reviews)
  Top words: ['air', 'hot', 'air conditioning', 'conditioning', 'air con', 'con', 'ac']
  Representative: Hednesford pure gym is like a sauna, the air conditioning hasn't been working since around May. I have put plenty of complaints in regarding

Topic 2  (167 reviews)
  Top words: ['cleaning', 'dirty', 'clean', 'equipment', 'stations', 'toilets', 'wipe']
  Representative: This gym leaves a lot to be desired. I cancelled my membership here and joined a different 24 hour one ten minutes away as I couldn't take i

Topic 3  (146 reviews)
  Top words: ['toilets', 'toilet', 'changing', 'dirty', 'soap', 'smell', 'always']
  Representative: Stop the cleans from sleeping in male toilets. Or sitting down hiding in the toilet on their phones. Having seen it on many occasions. Have 

Topic 4  (137 reviews)
  Top words: ['class', 'classes', 'booked', 'instructor', 'instructors', 'cancelled', 'time']
  Representative: Not impressed with the classes or instructors taking the class

Topic 5  (127 reviews)
  Top words: ['parking', 'car', 'park', 'free parking', 'free', 'fine', 'parking fine']
  Representative: Such a shame to have to write the review because I’ve always liked this gym. Was going before covid and never had any issues with the parkin

Topic 6  (107 reviews)
  Top words: ['price', 'equipment', 'gyms', 'one', 'also', 'month', 'would']
  Representative: I have been a member of a few Pure Gyms in Edinburgh since 2012, so was looking forward to the gym opening in Linlithgow. It opened yesterda

Topic 7  (105 reviews)
  Top words: ['closed', 'open', '247', 'hours', 'christmas', 'opening', 'day']
  Representative: Turned up at my 24vgour unstaffed gym to find it is closed, I was inbrhe gym yesterday no notice no warning just closed.
Given the fact the 

Topic 8  (87 reviews)
  Top words: ['showers', 'cold', 'shower', 'water', 'temperature', 'hot', 'changing']
  Representative: When I first joined PureGym the showers were nice and hot but the last few months they have been very cold, I asked why this was and was tol

Topic 9  (86 reviews)
  Top words: ['manager', 'rude', 'member', 'staff', 'aggressive', 'us', 'voice']
  Representative: Avoid this gym if you want to exercise in a friendly and clean space. The gym manager named DARIA UNIATOWSKA is extremely unprofessional and

Topic 0: Membership access (PIN/QR codes, app)
Topic 1: Cold showers / no hot water
Topic 2: Toilets & changing rooms
Topic 3: Toilets & changing rooms
Topic 4: Classes & instructors
Topic 5: Parking (fines, unclear rules)
Topic 6: Equipment availability & maintenance
Topic 7: Other: closed, open, 247
Topic 8: Cold showers / no hot water
Topic 9: Staff conduct & management
[Figure: heatmap — top-10 topics × 14 discriminative words (c-TF-IDF weights)]

Performing further data investigationΒΆ

Rubric item 20ΒΆ

List out the top 20 locations with the highest number of negative reviews. Do this separately for Google and Trustpilot's reviews, and comment on the result. Are the locations roughly similar in both data sets?

Our learnings

  • Expect moderate overlap (big-city gyms appear in both top 20s) but not identical lists — Google over-indexes on high-footfall locations; Trustpilot skews to places with billing disputes, which correlates loosely with city gym density.
  • Write your comment in the cell below after you see the tables — the overlap sketch after the output gives you a number to anchor it on.
InΒ [25]:
# Exclude Trustpilot's '345' and '398' numeric placeholders from location-specific
# rankings (Sonnet investigation 2026-04-25 confirmed each is a multi-site
# catch-all bucket, not a single gym; including them would inflate one fake row
# in the top-N). They stay in overall sentiment / topic / emotion analysis.
EXCLUDE_PLACEHOLDERS = {'345', '398'}

g_top20 = google_neg["Club's Name"].dropna().astype(str).value_counts().head(20)
t_top20 = (
    trustpilot_neg['Location Name'].dropna().astype(str)
    .loc[lambda s: ~s.isin(EXCLUDE_PLACEHOLDERS)]
    .value_counts()
    .head(20)
)
print("Top 20 negative-review Google locations:")
print(g_top20)
print()
print("Top 20 negative-review Trustpilot locations (placeholders excluded):")
print(t_top20)
Top 20 negative-review Google locations:
Club's Name
London Stratford            59
London Woolwich             26
London Canary Wharf         26
London Enfield              24
London Palmers Green        22
London Swiss Cottage        22
London Leytonstone          21
Birmingham City Centre      20
Bradford Thornbury          19
Wakefield                   18
New Barnet                  18
London Hoxton               18
Peterborough Serpentine     18
Manchester Exchange Quay    17
London Seven Sisters        17
Walsall Crown Wharf         17
London Hayes                17
Nottingham Colwick          16
London Bermondsey           15
London Greenwich            15
Name: count, dtype: int64

Top 20 negative-review Trustpilot locations (placeholders excluded):
Location Name
Leicester Walnut Street      50
London Enfield               23
London Stratford             22
Burnham                      20
London Ilford                18
London Bermondsey            18
York                         16
London Hayes                 16
London Seven Sisters         16
Maidenhead                   16
London Finchley              16
Northwich                    15
London Swiss Cottage         15
London Hammersmith Palais    15
Basildon                     14
Birmingham City Centre       14
Bradford Thornbury           14
Telford                      14
New Barnet                   14
Dudley Tipton                14
Name: count, dtype: int64
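
To answer "roughly similar?" with a number rather than an eyeball, intersect the two top-20 indexes (uses the two Series above):

overlap = sorted(set(g_top20.index) & set(t_top20.index))
print(f"Locations in both top-20 lists: {len(overlap)}")
print(overlap)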

Rubric item 21ΒΆ

Merge the 2 data sets using Location Name and Club's Name.

Now, list out the following:

  • Locations
  • Number of Trustpilot reviews for this location
  • Number of Google reviews for this location
  • Total number of reviews for this location (sum of Google reviews and Trustpilot reviews)

Sort based on the total number of reviews.

Our learnings

  • We join on the normalised location key from item 5 so near-duplicate names line up.
  • Sorted descending — the top ~30 rows feed item 22 and item 23.
  • Caveat: the '345' placeholder resurfaces in this table (the exclusion in item 20 only covered the negative-review top-20). Filter it out here too if you don't want it feeding the top-30 corpus.
InΒ [26]:
g_counts = google_df.groupby("Club's Name").size().rename('google_n')
t_counts = trustpilot_df.groupby('Location Name').size().rename('trustpilot_n')

# Normalise to merge
g_counts_df = g_counts.reset_index().rename(columns={"Club's Name": 'loc'})
g_counts_df['key'] = g_counts_df['loc'].apply(norm)
t_counts_df = t_counts.reset_index().rename(columns={'Location Name': 'loc'})
t_counts_df['key'] = t_counts_df['loc'].apply(norm)

merged = (g_counts_df.merge(t_counts_df, on='key', how='outer', suffixes=('_g', '_t'))
          .fillna({'google_n': 0, 'trustpilot_n': 0}))
merged['display_name'] = merged['loc_g'].fillna(merged['loc_t'])
merged['total'] = merged['google_n'] + merged['trustpilot_n']
merged = merged[['display_name', 'google_n', 'trustpilot_n', 'total']].sort_values('total', ascending=False)
merged.head(30)
Out[26]:
display_name google_n trustpilot_n total
336 London Park Royal 47.0 137.0 184.0
209 Elkridge 183.0 0.0 183.0
453 Springfield 181.0 0.0 181.0
62 345 0.0 172.0 172.0
372 Manchester Market Street 125.0 29.0 154.0
344 London Stratford 93.0 56.0 149.0
310 London Finchley 91.0 51.0 142.0
270 Leicester Walnut Street 55.0 82.0 137.0
262 Leeds Bramley 98.0 28.0 126.0
424 Purley 82.0 42.0 124.0
308 London Enfield 71.0 53.0 124.0
375 Manchester Stretford 95.0 27.0 122.0
412 Peterborough Brotherhood Retail Park 67.0 55.0 122.0
84 Altrincham 96.0 22.0 118.0
238 Halifax 53.0 62.0 115.0
486 Tysons Corner 115.0 0.0 115.0
290 London Bermondsey 51.0 60.0 111.0
147 Burnham 38.0 72.0 110.0
466 Stoke on Trent North 78.0 31.0 109.0
346 London Swiss Cottage 54.0 53.0 107.0
509 Wolverhampton Bentley Bridge 64.0 42.0 106.0
419 Port Talbot 64.0 40.0 104.0
361 Maidenhead 50.0 51.0 101.0
342 London Southgate 72.0 28.0 100.0
316 London Hammersmith Palais 44.0 55.0 99.0
485 Tyldesley 58.0 39.0 97.0
516 York 26.0 70.0 96.0
394 Northwich 58.0 37.0 95.0
150 Caerphilly 48.0 46.0 94.0
224 Glasgow Giffnock 49.0 44.0 93.0

Rubric item 22ΒΆ

For the top 30 locations, redo the word frequency and word cloud. Comment on the results, and highlight if the results are different from the first run.

Our learnings

  • We redo on all reviews (not just negatives) at these top-30 locations.
  • Expected shift: positive/neutral words re-enter the cloud ("friendly", "clean", "good") β€” because we're no longer filtered to negatives.
InΒ [27]:
top30_keys = set(merged.head(30)['display_name'].apply(norm))
g30 = google_df[google_df["Club's Name"].apply(norm).isin(top30_keys)]
t30 = trustpilot_df[trustpilot_df['Location Name'].apply(norm).isin(top30_keys)]

combined_clean = ' '.join(pd.concat([g30['clean'], t30['clean']]))

# Frequency
from collections import Counter
freq = Counter(combined_clean.split())
print("Top 20 words across top 30 locations:", freq.most_common(20))

# Wordcloud
fig, ax = plt.subplots(figsize=(12, 5))
wc = WordCloud(width=900, height=400, background_color='white', collocations=False).generate(combined_clean)
ax.imshow(wc); ax.axis('off'); ax.set_title('Top 30 locations β€” combined Google + Trustpilot')
plt.show()
Top 20 words across top 30 locations: [('classes', 665), ('staff', 658), ('equipment', 652), ('friendly', 436), ('class', 427), ('clean', 389), ('love', 333), ('machines', 304), ('place', 251), ('well', 243), ('amazing', 227), ('work', 226), ('need', 217), ('helpful', 201), ('busy', 197), ('feel', 175), ('workout', 175), ('new', 172), ('fitness', 171), ('members', 162)]
[Word cloud: top 30 locations, combined Google + Trustpilot reviews]

Rubric item 23ΒΆ

For the top 30 locations, combine the reviews from Google and Trustpilot and run them through BERTopic.

Comment on the following:

  • Are the results any different from the first run of BERTopic?
  • If so, what has changed?
  • Are there any additional insights compared to the first run?

Our learnings

  • "Combine the reviews" = concatenate the text lists (same pattern as item 12, just scoped to top-30 locations rather than common locations).
  • Bigger corpus than item 13 β†’ BERTopic usually finds more, finer-grained topics. Look for splits that weren't there before (e.g. "cancellation process" separating from "refund dispute").
InΒ [28]:
reviews_top30 = (g30['Comment'].astype(str).tolist()
                 + t30['Review Content'].astype(str).tolist())
print(f"Top-30-locations combined reviews: {len(reviews_top30):,}")

topic_model_top30 = BERTopic(vectorizer_model=vectorizer, umap_model=make_umap(),
                             min_topic_size=30, verbose=False)
topics30, _ = topic_model_top30.fit_transform(reviews_top30)
topic_model_top30.get_topic_info().head(15)
Top-30-locations combined reviews: 3,690
Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Out[28]:
Topic Count Name Representation Representative_Docs
0 -1 1369 -1_great_classes_class_equipment [great, classes, class, equipment, good, always, staff, really, one, clean] [I recently joined this gym and I must say, it has exceeded all my expectations. From the moment I walked in, I was ...
1 0 914 0_great_good_equipment_friendly [great, good, equipment, friendly, staff, classes, machines, always, clean, nice] [This is a Great gym, Really recommend the Gym classes to anyone joining ! Super good workout to great music & can w...
2 1 382 1_equipment_staff_clean_good [equipment, staff, clean, good, friendly, great, facilities, helpful, atmosphere, nice] [Easy to access. Clean and well maintained. Lots of equipment. Good atmosphere., Good atmosphere,friendly staff,go...
3 2 162 2_classes_class_great_great class [classes, class, great, great class, great classes, instructors, love, fun, amazing, love classes] [Great class, Great class!, Excellently classes]
4 3 145 3_cleaning_equipment_toilets_changing [cleaning, equipment, toilets, changing, one, use, dirty, clean, machines, smell] [Been coming here since January and I don’t have much to complain about. I’ve heard this location is better than mos...
5 4 126 4_membership_email_didnt_pin [membership, email, didnt, pin, account, code, month, fee, pass, day pass] [ANJA Is an Angel! I made a mistake of thinking I cancelled my membership! I swear I went to membership I clicked on...
6 5 116 5_fitness_classes_friendly_staff [fitness, classes, friendly, staff, clean, trainers, great, equipment, ive, amazing] [Pure Gym provides an exceptional fitness experience with its well-maintained equipment, spacious workout areas, div...
7 6 88 6_showers_toilets_shower_dirty [showers, toilets, shower, dirty, changing, order, fix, cold, please, water] [I find it really hard to access this gym due to people using the car park as their workplace or home parking. I oft...
8 7 64 7_easy_app_process_simple [easy, app, process, simple, joining, join, easy use, online, straight, app easy] [Simple and very easy, Easy to join., Very easy to do]
9 8 60 8_rude_manager_member_people [rude, manager, member, people, im, voice, like, even, dont, staff] [Avoid this gym if you want to exercise in a friendly and clean space. The gym manager named DARIA UNIATOWSKA is ext...
10 9 40 9_love_good_amazing_back [love, good, amazing, back, loved, feeling, ok, perfect, nice, bit] [Love it, Love it 😘, Love it here ive lost almost 4 stone feeling great]
11 10 35 10_circuits_jamie_class_circuits class [circuits, jamie, class, circuits class, circuit, energy, full, tuesday, always, circuit class] [Jamie Ts circuit class Tuesday evenings and Thursday mornings is a brilliant full body work out, Jamie is full of e...
12 11 34 11_class_andrea_step_step class [class, andrea, step, step class, instructor, amazing, love class, really, best, week] [Loved Andrea step class!!! It was an amazing workout, Andrea’s step class is amazing, wish there were more!, Andrea...
13 12 33 12_parking_park_retail park_car [parking, park, retail park, car, retail, free, cars, free parking, hours, brotherhood retail] [Your website boasts free parking. I wrongly made the assumption this was for members and not for people using it as...
14 13 31 13_staff_classes_friendly_friendly staff [staff, classes, friendly, friendly staff, great, classes staff, really enjoy, really, enjoy, great staff] [Great classes here and staff great too!, Really enjoy the classes . Staff are very helpful and location is perfect ...

Conducting emotion analysisΒΆ

Rubric item 24ΒΆ

Import the BERT model bhadresh-savani/bert-base-uncased-emotion from Hugging Face, and set up a pipeline for text classification.

Our learnings

  • This is the rubric-specified model. It emits 6 labels: anger, fear, joy, love, sadness, surprise (no 'neutral', no 'disgust'; the item 25 output below shows the exact set).
  • Known weakness (covered again at item 27): it was fine-tuned on Twitter-sourced data, and politely-worded British prose complaints often land as joy because their opening sentences resemble positive tweets in the training distribution. We flag this.
  • First run downloads ~400MB.
InΒ [29]:
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
print('Using GPU' if device == 0 else 'Using CPU (this will be slow)')

emotion = pipeline('text-classification',
                   model='bhadresh-savani/bert-base-uncased-emotion',
                   truncation=True, max_length=512, device=device)
Using GPU
config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]
model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]
Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]
BertForSequenceClassification LOAD REPORT from: bhadresh-savani/bert-base-uncased-emotion
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
tokenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]
vocab.txt: 0.00B [00:00, ?B/s]
tokenizer.json: 0.00B [00:00, ?B/s]
special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Rubric item 25ΒΆ

With the help of an example sentence, run the model and display the different emotion classifications that the model outputs.

Our learnings

  • Set top_k=None (replaces deprecated return_all_scores=True) to see the full probability distribution.
InΒ [30]:
example = "The changing rooms were filthy and the staff didn't care at all."
all_scores = emotion(example, top_k=None)
for item in all_scores:
    print(f"  {item['label']:10s}  {item['score']:.3f}")
  sadness     0.698
  anger       0.292
  fear        0.007
  surprise    0.001
  love        0.001
  joy         0.001

Rubric item 26ΒΆ

Run this model on both data sets, and capture the top emotion for each review.

Our learnings

  • Batched, not sequential. Pass lists of texts with batch_size set and let the pipeline batch internally on the A100. A naive .apply(lambda r: pipe(r)) loop calls the pipeline once per row β€” HF's transformers warns about this ("You seem to be using the pipelines sequentially on GPU"). Batching is ~20–40Γ— faster on A100 for this model size. We still chunk the list for progress reporting, which is why that warning fires once in the output; it is harmless here, and HF's fully-streamed alternative is sketched below.
  • Truncation at 512 tokens is already set on the pipeline (item 24), so we don't need to slice text beforehand.
  • Results are written back to each dataframe as an emotion column.
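A minimal sketch of the streamed route HF's warning recommends, assuming the datasets package (preinstalled on Colab). KeyDataset feeds rows straight into the pipeline so it never sees a bare Python loop:

# Hedged alternative to the chunked loop below β€” same labels, one call.
from datasets import Dataset
from transformers.pipelines.pt_utils import KeyDataset

ds = Dataset.from_dict({'text': google_df['Comment'].astype(str).tolist()})
labels = [out['label'] for out in emotion(KeyDataset(ds, 'text'), batch_size=64)]

We keep the chunked loop in the cell below because its timestamped progress lines survive in the saved notebook output.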
InΒ [31]:
import time
import torch
from tqdm.auto import tqdm

BATCH = 64

# --- Runtime sanity ---
dev = emotion.model.device
gpu_ok = torch.cuda.is_available() and dev.type == 'cuda'
print(f"Emotion pipeline device: {dev}  (torch.cuda.is_available()={torch.cuda.is_available()})")
if gpu_ok:
    print(f"  GPU: {torch.cuda.get_device_name(dev.index)}  "
          f"mem free: {torch.cuda.mem_get_info(dev.index)[0] / 1e9:.1f} GB")
else:
    print("  WARNING: running on CPU β€” expect 20x slower. Colab Runtime β†’ Change runtime type β†’ A100 and rerun item 24.")

def classify_with_progress(texts, label):
    """Emit per-batch progress with ETA; return list of label strings."""
    n = len(texts)
    print(f"\n[{time.strftime('%H:%M:%S')}] {label}: {n:,} reviews, batch={BATCH}")
    t0 = time.time()
    labels = []
    bar = tqdm(range(0, n, BATCH), desc=label, unit='batch')
    for i in bar:
        chunk = texts[i:i + BATCH]
        out = emotion(chunk, batch_size=BATCH)
        labels.extend(r['label'] for r in out)
        # ETA line shown by tqdm; print every 20 batches for log-scroll history
        if (i // BATCH) % 20 == 0 and i > 0:
            elapsed = time.time() - t0
            rate = len(labels) / elapsed
            eta = (n - len(labels)) / rate if rate > 0 else 0
            print(f"  [{time.strftime('%H:%M:%S')}] {len(labels):,}/{n:,} "
                  f"({100*len(labels)/n:4.1f}%)  {rate:.0f} rev/s  ETA {eta:.0f}s")
    elapsed = time.time() - t0
    print(f"[{time.strftime('%H:%M:%S')}] {label} done: {n:,} in {elapsed:.1f}s "
          f"({n/elapsed:.0f} rev/s)")
    return labels

# --- Google ---
g_texts = google_df['Comment'].astype(str).tolist()
google_df['emotion'] = classify_with_progress(g_texts, 'Google reviews')

# --- Trustpilot ---
t_texts = trustpilot_df['Review Content'].astype(str).tolist()
trustpilot_df['emotion'] = classify_with_progress(t_texts, 'Trustpilot reviews')

print(f"\n[{time.strftime('%H:%M:%S')}] All done.")
google_df['emotion'].value_counts()
Emotion pipeline device: cuda:0  (torch.cuda.is_available()=True)
  GPU: NVIDIA A100-SXM4-80GB  mem free: 83.9 GB

[20:20:09] Google reviews: 11,879 reviews, batch=64
Google reviews:   0%|          | 0/186 [00:00<?, ?batch/s]
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  [20:20:14] 1,344/11,879 (11.3%)  262 rev/s  ETA 40s
  [20:20:19] 2,624/11,879 (22.1%)  271 rev/s  ETA 34s
  [20:20:23] 3,904/11,879 (32.9%)  291 rev/s  ETA 27s
  [20:20:28] 5,184/11,879 (43.6%)  281 rev/s  ETA 24s
  [20:20:33] 6,464/11,879 (54.4%)  272 rev/s  ETA 20s
  [20:20:38] 7,744/11,879 (65.2%)  269 rev/s  ETA 15s
  [20:20:43] 9,024/11,879 (76.0%)  270 rev/s  ETA 11s
  [20:20:47] 10,304/11,879 (86.7%)  271 rev/s  ETA 6s
  [20:20:52] 11,584/11,879 (97.5%)  268 rev/s  ETA 1s
[20:20:54] Google reviews done: 11,879 in 44.3s (268 rev/s)

[20:20:54] Trustpilot reviews: 16,581 reviews, batch=64
Trustpilot reviews:   0%|          | 0/260 [00:00<?, ?batch/s]
  [20:20:57] 1,344/16,581 ( 8.1%)  365 rev/s  ETA 42s
  [20:21:01] 2,624/16,581 (15.8%)  366 rev/s  ETA 38s
  [20:21:05] 3,904/16,581 (23.5%)  352 rev/s  ETA 36s
  [20:21:09] 5,184/16,581 (31.3%)  342 rev/s  ETA 33s
  [20:21:12] 6,464/16,581 (39.0%)  341 rev/s  ETA 30s
  [20:21:16] 7,744/16,581 (46.7%)  338 rev/s  ETA 26s
  [20:21:21] 9,024/16,581 (54.4%)  334 rev/s  ETA 23s
  [20:21:24] 10,304/16,581 (62.1%)  333 rev/s  ETA 19s
  [20:21:28] 11,584/16,581 (69.9%)  333 rev/s  ETA 15s
  [20:21:32] 12,864/16,581 (77.6%)  331 rev/s  ETA 11s
  [20:21:38] 14,144/16,581 (85.3%)  319 rev/s  ETA 8s
  [20:21:42] 15,424/16,581 (93.0%)  318 rev/s  ETA 4s
[20:21:46] Trustpilot reviews done: 16,581 in 52.2s (317 rev/s)

[20:21:46] All done.
Out[31]:
count
emotion
joy 8318
anger 1660
sadness 1123
love 359
fear 332
surprise 87

Rubric item 27ΒΆ

Use a bar plot to show the top emotion distribution for all negative reviews in both data sets.

Our learnings

  • We show counts AND percentages β€” percentages are what you'll actually cite in the report.
  • Joy in 1–2-star reviews is almost certainly wrong. Two likely causes: (1) the tweet-trained model misreads polite British complaint phrasing ("I have been a loyal customer for three years, however...") as joy; (2) sarcasm ("great, another broken treadmill"). Worth a callout in the report.
  • The rubric's next step filters on anger only. Sadness is arguably just as useful but we follow the rubric.
InΒ [32]:
g_neg = google_df[google_df['Overall Score'] < 3]
t_neg = trustpilot_df[trustpilot_df['Review Stars'] < 3]

# Emotion palette β€” consistent across both platforms so emotions read same colour.
EMOTION_COLOURS = {
    'anger':    '#D7263D',
    'sadness':  '#1B98E0',
    'fear':     '#7B2CBF',
    'surprise': '#F18F01',
    'joy':      '#F4D35E',
    'love':     '#E84D8A',
    'disgust':  '#6A994E',
    'neutral':  '#888888',
}

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, df, title in [(axes[0], g_neg, 'Google negatives'),
                       (axes[1], t_neg, 'Trustpilot negatives')]:
    counts = df['emotion'].value_counts()
    pct = (counts / counts.sum() * 100).round(1)
    colors = [EMOTION_COLOURS.get(e, '#999') for e in counts.index]
    bars = ax.bar(range(len(counts)), counts.values, color=colors,
                  edgecolor='white', linewidth=0.5)
    labels = [f'{e}\n{c:,} ({p}%)' for e, c, p in zip(counts.index, counts.values, pct.values)]
    ax.set_xticks(range(len(counts)))
    ax.set_xticklabels(labels, rotation=0, fontsize=9)
    ax.set_title(f'{title} β€” emotion distribution', fontsize=13, fontweight='bold', pad=10)
    ax.set_ylabel('Reviews')
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.grid(axis='y', alpha=0.25, linestyle='--')
plt.tight_layout(); plt.show()

# Sanity: how many 1-star reviews got labelled joy? (red flag for model mis-classification.)
joy_in_1star = g_neg[(g_neg['Overall Score'] == 1) & (g_neg['emotion'] == 'joy')]
print(f"\n1-star Google reviews labelled 'joy' by the model: {len(joy_in_1star)} "
      f"({len(joy_in_1star) / max(len(g_neg[g_neg['Overall Score'] == 1]), 1) * 100:.1f}% of 1-stars)")
print("Sample:"); print(joy_in_1star['Comment'].head(3).to_string())
[Bar plots: emotion distribution (counts and percentages), Google negatives vs Trustpilot negatives]
1-star Google reviews labelled 'joy' by the model: 280 (17.3% of 1-stars)
Sample:
55                                                              Became super overcrowded, it's impossible to workout after 5pm
111    The gym is ok, but could you please lower the music volume?\nNot everyone shares the same musical tastes, and we'd l...
124    PURE GYM LICHFIELD HAS DECIDED TO GIVE THE NEW EQUIPMENT A MISS. THEY'VE HAD THESE MACHINES SINCE DAY DOT! If you po...

Rubric item 28ΒΆ

Extract all the negative reviews (from both data sets) where anger is top emotion.

Our learnings

  • This is the rubric's chosen cut. We note in the appendix that including sadness too would ~double the subset with minimal topic-model drift.
InΒ [33]:
anger_g = g_neg[g_neg['emotion'] == 'anger']
anger_t = t_neg[t_neg['emotion'] == 'anger']
anger_reviews = anger_g['Comment'].astype(str).tolist() + anger_t['Review Content'].astype(str).tolist()
print(f"Anger in Google negatives:     {len(anger_g):,}")
print(f"Anger in Trustpilot negatives: {len(anger_t):,}")
print(f"Combined anger reviews:        {len(anger_reviews):,}")
Anger in Google negatives:     958
Anger in Trustpilot negatives: 1,579
Combined anger reviews:        2,537

Rubric item 29ΒΆ

Run BERTopic on the output of the previous step.

Our learnings

  • Smaller corpus than item 13 β€” we drop min_topic_size to 10 to avoid losing too many reviews to the outlier bucket (the share that still lands there is checked after the topic table below).
InΒ [34]:
topic_model_anger = BERTopic(vectorizer_model=vectorizer, umap_model=make_umap(),
                             min_topic_size=10, verbose=False)
anger_topics, _ = topic_model_anger.fit_transform(anger_reviews)
topic_model_anger.get_topic_info().head(10)
Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Out[34]:
Topic Count Name Representation Representative_Docs
0 -1 629 -1_changing_staff_get_people [changing, staff, get, people, equipment, showers, one, membership, ive, water] [Standard pure gym and you get what you pay for but since I've been going in the last 6 months the toilets have been...
1 0 281 0_equipment_people_machines_weights [equipment, people, machines, weights, use, phones, one, machine, time, busy] [Extremely hot, extremely busy and extremely annoying. I will preface this by saying that I only have positive expe...
2 1 220 1_membership_access_cancel_month [membership, access, cancel, month, email, app, pay, fee, get, customer] [What went wrong was I have to buy a day pass on a different email to get access to this gym, I’ve got the plus mult...
3 2 155 2_staff_rude_member_members [staff, rude, member, members, manager, people, weights, personal, one, said] [Been going here for a couple months now.... two things really stuck out to me.\n1. Not a single weight will be in i...
4 3 90 3_membership_payment_cancel_contact [membership, payment, cancel, contact, cancel membership, email, account, charged, money, cancelled] [Paused my membership. Went on 3 weeks later and cancelled but as they don't send any confirmation emails I didn't r...
5 4 87 4_fee_joining_joining fee_code [fee, joining, joining fee, code, charged, discount, promo, promo code, month, membership] [JOINING FEE?? Why? While others offer NO JOINING FEE., I had a code to no joining fee and 3 months discount but it ...
6 5 77 5_class_classes_booked_cancelled [class, classes, booked, cancelled, book, instructors, instructor, one, time, week] [Absolute madness, booked classes and went to attend but no one was there to conduct class., The gym has down hill b...
7 6 73 6_rude_staff_manager_rude staff [rude, staff, manager, rude staff, unprofessional, unhelpful, customers, manager rude, management, customer] [The manager is very rude with the customers and very disrespectful.\nI have a horrible day., Staff are rude and ext...
8 7 70 7_crowded_busy_machines_enough [crowded, busy, machines, enough, enough machines, many, equipment, many people, people, enough equipment] [No enough machines, Too crowded, not enough equipment, Not enough machines to many people]
9 8 70 8_closed_open_christmas_247 [closed, open, christmas, 247, opening, hours, time, day, closing, 6am] [Turned up at my 24vgour unstaffed gym to find it is closed, I was inbrhe gym yesterday no notice no warning just cl...
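A quick check on the learnings point above: what share of the anger corpus still fell into BERTopic's -1 outlier bucket at min_topic_size=10. A small sketch against the fitted model:

import numpy as np

# Topic -1 is BERTopic's outlier bucket; anger_topics comes from fit_transform above.
outlier_share = np.mean(np.array(anger_topics) == -1)
print(f"Outlier bucket: {outlier_share:.1%} of {len(anger_topics):,} anger reviews")

From the table above that is 629 / 2,537, roughly 24.8%, which is tolerable for a corpus this size.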

Rubric item 30ΒΆ

Visualise the clusters from this run. Comment on whether it is any different from the previous runs, and whether it is possible to narrow down the primary issues that have led to an angry review.

Our learnings

  • Angry-review topics are usually more actionable than the generic BERTopic run β€” anger concentrates around billing disputes, cancellation refusals, broken equipment reported multiple times, and staff conflict. These are things the business can fix.
InΒ [35]:
# Plotly figure with PNG fallback for nbviewer / non-widget Jupyter renderers.
fig = topic_model_anger.visualize_topics()
try:
    fig.write_image('topics_anger.png', width=1200, height=800, scale=2)
    from IPython.display import Image, display
    display(Image('topics_anger.png'))
except Exception as exc:
    print(f"PNG export failed (likely missing kaleido): {exc}")
fig
PNG export failed (likely missing kaleido): 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Using a large language modelΒΆ

Rubric item 31ΒΆ

Load the following model: tiiuae/falcon-7b-instruct. Set the pipeline for text generation and a max length of 1,000 for each review.

Our learnings

  • We swap Falcon for an open HF model loaded locally on the A100 β€” Russell green-lit the swap in Q&A. Falcon-7b-instruct is dated.
  • Default is Qwen/Qwen2.5-7B-Instruct: Apache 2.0, strong instruction-following, not gated. Swap to Llama-3.1-8B-Instruct if you've requested Meta's access.
  • Auth is via your HF_TOKEN (set once in Colab's πŸ”‘ Secrets panel, left sidebar). No local daemon, no install step.
InΒ [36]:
import os, torch
from transformers import pipeline

# Pull HF_TOKEN from Colab Secrets (πŸ”‘ icon in left sidebar: add HF_TOKEN).
# Fallback to env var for non-Colab runs.
try:
    from google.colab import userdata
    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')
except Exception:
    pass
assert os.environ.get('HF_TOKEN'), "Set HF_TOKEN in Colab Secrets (πŸ”‘ sidebar) or env var."

MODEL_ID = 'Qwen/Qwen2.5-7B-Instruct'  # open, not gated, solid instruction model

llm = pipeline(
    'text-generation',
    model=MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    token=os.environ['HF_TOKEN'],
)
print(f'Loaded {MODEL_ID} on {llm.device}')
config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]
`torch_dtype` is deprecated! Use `dtype` instead!
model.safetensors.index.json: 0.00B [00:00, ?B/s]
Downloading (incomplete total...): 0.00B [00:00, ?B/s]
Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]
Loading weights:   0%|          | 0/339 [00:00<?, ?it/s]
generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]
tokenizer_config.json: 0.00B [00:00, ?B/s]
vocab.json: 0.00B [00:00, ?B/s]
merges.txt: 0.00B [00:00, ?B/s]
tokenizer.json: 0.00B [00:00, ?B/s]
Loaded Qwen/Qwen2.5-7B-Instruct on cuda:0

Rubric item 32ΒΆ

Add the following prompt to every review, before passing it on to the model:

In the following customer review, pick out the main 3 topics. Return them in a numbered list format, with each one on a new line.

Run the model.

Note: if execution time is too high, use a subset of the bad reviews to run this model.

Our learnings

  • Cohort pain point: LLMs drift off-format β€” preambles ("Here are the topics:"), numbered lists with bullet sub-items, refusals to answer for short reviews. Our prompt is written defensively to cut these modes. Explicit: no preamble, no explanation, strict JSON array output.
  • Batched, not sequential. We build all chat-formatted messages in one list and pass them in one pipeline call with batch_size=16. Hugely faster than looping β€” same reason as item 26.
  • llm.tokenizer.padding_side = 'left' is required for decoder-only models during batched generation (otherwise the padding tokens land in the wrong place and generation looks garbled).
  • Runs on the full anger set by default (A100 handles it). Set SAMPLE = 100 if you want to iterate quickly on the prompt.
InΒ [37]:
import json, warnings
from transformers import GenerationConfig
import sys, torch

# Silence jupyter_client's datetime.utcnow() deprecation spam (Colab Python 3.12+).
# Not our code β€” upstream heartbeat. Documented in brain-vault/skills/workbench.md.
# Module-scoped filter; NOT a message-substring whitelist, so the warmup guard below
# keeps its strict 'assert not caught' on user-code warnings.
warnings.filterwarnings('ignore', category=DeprecationWarning, module=r'jupyter_client.*')

# =============================================================
# PRE-FLIGHT GPU CHECK - NOT RUN if pipeline is on CPU.
# Canonical helper: workbench/preflight.py :: require_gpu().
# =============================================================
_dev = llm.model.device
if not torch.cuda.is_available():
    sys.stderr.write("\n" + "=" * 64 + "\n")
    sys.stderr.write("PRE-FLIGHT ABORT - NOT RUNNING\n")
    sys.stderr.write("=" * 64 + "\n")
    sys.stderr.write("torch.cuda.is_available() == False\n")
    sys.stderr.write("Attach A100: Runtime > Change runtime type > GPU > A100.\n")
    sys.stderr.write("=" * 64 + "\n")
    raise SystemExit(1)
if _dev.type != 'cuda':
    sys.stderr.write("\n" + "=" * 64 + "\n")
    sys.stderr.write("PRE-FLIGHT ABORT - NOT RUNNING\n")
    sys.stderr.write("=" * 64 + "\n")
    sys.stderr.write(f"llm.model.device == {_dev}  (but cuda IS available)\n")
    sys.stderr.write("Pipeline was loaded before the GPU attached. Recover in place:\n")
    sys.stderr.write("    llm.model = llm.model.to(\u0027cuda\u0027)\n")
    sys.stderr.write("Then rerun this cell.\n")
    sys.stderr.write("=" * 64 + "\n")
    raise SystemExit(1)
print(f"[preflight] GPU ok: {_dev}")


SAMPLE = None  # full anger set on A100; set to 100 for quick prompt iteration
BATCH = 16     # bumps throughput on A100; lower if you hit OOM

TOPIC_PROMPT = """You are extracting topics from a customer review of a UK gym chain.

Return EXACTLY 3 topics as a JSON array of short noun phrases (2-4 words each, lowercase).
Do NOT include explanation, preamble, or any text outside the JSON array.
Do NOT repeat the review. Do NOT describe what you are doing.
Do NOT use numbered lists β€” only a JSON array.

Good example: ["equipment out of order", "staff unresponsive", "cleanliness issues"]
Bad example:  "Here are the topics: 1. Equipment..."

Review: {review}

JSON array:"""

# Decoder-only needs left-padding during batched generation
llm.tokenizer.padding_side = 'left'
if llm.tokenizer.pad_token_id is None:
    llm.tokenizer.pad_token_id = llm.tokenizer.eos_token_id

# One explicit GenerationConfig β€” passed per call, no attribute mutation.
# This avoids the "Both max_new_tokens and max_length" warning that fires
# when generation_config.max_length is left at Qwen's shipped default of 20.
BASE_GEN_CFG = GenerationConfig(
    max_new_tokens=120,
    do_sample=False,                     # greedy for reproducibility
    temperature=None,                    # null sampling params so Qwen's
    top_p=None,                          # shipped defaults don't leak
    top_k=None,                          # through and trigger the warning
    pad_token_id=llm.tokenizer.pad_token_id,
    eos_token_id=llm.model.generation_config.eos_token_id,
)

def llm_complete(prompt, max_new_tokens=None):
    """One chat-templated completion. Accepts optional max_new_tokens override."""
    cfg = BASE_GEN_CFG
    if max_new_tokens is not None:
        cfg = GenerationConfig(**{**BASE_GEN_CFG.to_dict(), 'max_new_tokens': max_new_tokens})
    out = llm([{'role': 'user', 'content': prompt}],
              generation_config=cfg, return_full_text=False)
    return out[0]['generated_text']

# --- Pre-flight warmup: 1 prompt, capture warnings, fail loud if any generation-config
# warning fires. Catches both "max_length=20" and "dual-path deprecation" bugs in <2s,
# not in the middle of a 5-minute run.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    # Re-apply the upstream-cosmetic filter inside the context β€” simplefilter('always')
    # above wiped the filter list. This keeps jupyter_client heartbeat spam out of
    # `caught` while preserving the strict assert on everything else.
    warnings.filterwarnings('ignore', category=DeprecationWarning, module=r'jupyter_client.*')
    _ = llm_complete('Say "ok" and nothing else.')
# Strict: ANY warning during a 1-prompt warmup is a fix-now signal.
# The previous substring-whitelist missed the temperature/top_p/top_k
# "flags not valid" warning and reported false-OK.
assert not caught, (
    "Pre-flight warnings fired \u2014 fix BEFORE running full batch:\n"
    + "\n".join(f"  [{w.category.__name__}] {w.message}" for w in caught)
)
print(f"Pre-flight OK β€” no warnings captured.")

def extract_topics(text):
    """Return a list of topic strings; robust to format drift."""
    start, end = text.find('['), text.rfind(']')
    if start != -1 and end != -1:
        try:
            arr = json.loads(text[start:end + 1])
            return [str(x).strip().lower() for x in arr if isinstance(x, str)]
        except Exception:
            pass
    lines = [l.strip(' -.1234567890)') for l in text.splitlines() if l.strip()]
    return [l for l in lines if l and len(l) < 80][:3]

subset = anger_reviews[:SAMPLE] if SAMPLE else anger_reviews
print(f"Running {MODEL_ID} on {len(subset):,} reviews (batch={BATCH})...")

all_messages = [
    [{'role': 'user', 'content': TOPIC_PROMPT.format(review=rv[:800])}]
    for rv in subset
]

# Pass the same GenerationConfig object so the batch call is consistent with llm_complete
results = llm(all_messages, batch_size=BATCH,
              generation_config=BASE_GEN_CFG, return_full_text=False)

topics_per_review = [extract_topics(r[0]['generated_text']) for r in results]

for rv, tops in zip(subset[:3], topics_per_review[:3]):
    print(f"\nReview: {rv[:120]}")
    print(f"Topics: {tops}")
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
[preflight] GPU ok: cuda:0
Pre-flight OK β€” no warnings captured.
Running Qwen/Qwen2.5-7B-Instruct on 2,537 reviews (batch=16)...

Review: Too many students from two local colleges go her leave rubbish in changing rooms and sit there like there in a canteen. 
Topics: ['rubbish in changing rooms', 'overcrowding', 'disgusting behavior']

Review: This gym is way too hot to even workout in. There are no windows open and the AC barely works. The staff are no where ne
Topics: ['temperature issues', 'staff rudeness']

Review: After being at this gym for over a year I'm finally leaving. I'm gutted because while most of the staff and PTs are love
Topics: ['overcrowding', 'lack of equipment', 'temperature issues']

Rubric item 33ΒΆ

The output of the model will be the top 3 topics from each review. Append each of these topics from each review to create a comprehensive list.

Our learnings

  • Flattened list of all topic strings from all reviews. Expect ~3Γ— the review count minus parse failures; the shortfall is counted in the sketch after the output below.
InΒ [38]:
comprehensive_topics = [t for topics in topics_per_review for t in topics if t]
print(f"Comprehensive topic list: {len(comprehensive_topics):,} strings")
print("Sample:", comprehensive_topics[:10])
Comprehensive topic list: 5,999 strings
Sample: ['rubbish in changing rooms', 'overcrowding', 'disgusting behavior', 'temperature issues', 'staff rudeness', 'overcrowding', 'lack of equipment', 'temperature issues', 'lack of equipment', 'potential to be good']
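A hedged check on the "minus parse failures" caveat: count how many reviews yielded fewer than the requested 3 phrases.

# topics_per_review comes from the item 32 cell; each entry should hold 3 phrases.
short = sum(1 for t in topics_per_review if len(t) < 3)
print(f"{short:,} of {len(topics_per_review):,} reviews returned fewer than 3 topics")

(5,999 strings from 2,537 reviews is about 2.4 per review, so some shortfall is expected.)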

Rubric item 34ΒΆ

Use this list as input to run BERTopic again.

Our learnings

  • Feeding short LLM-extracted phrases into BERTopic acts like a second-pass distillation β€” the clusters are usually cleaner and more actionable than the first run (item 13), because the LLM did some topic extraction already.
InΒ [39]:
topic_model_llm = BERTopic(vectorizer_model=vectorizer, umap_model=make_umap(),
                           min_topic_size=5, verbose=False)
llm_topics, _ = topic_model_llm.fit_transform(comprehensive_topics)
topic_model_llm.get_topic_info().head(10)
Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Out[39]:
Topic Count Name Representation Representative_Docs
0 -1 588 -1_rude staff_feedback_cost_rude [rude staff, feedback, cost, rude, poorly, sharing, maintained, branch, arrogant, enforcement] [rude staff, rude staff, rude staff]
1 0 55 0_personal_turnover_section_leaving [personal, turnover, section, leaving, time personal, advice, departure, refusal, issues personal, worn] [personal trainers, personal trainer socializing, personal trainers scams]
2 1 52 1_service_customer service_customer_poor [service, customer service, customer, poor, support poor, service worst, customer response, service customer, poor p... [poor customer service, poor customer service, poor customer service]
3 2 49 2_room_lock_broken_room issues [room, lock, broken, room issues, odorous, faulty, room privacy, usage issue, mens, occupied] [dirty locker room, dirty locker room, lock information missing]
4 3 47 3_machines broken_machines_machine_broken [machines broken, machines, machine, broken, issue machines, usage, looked, machines machine, machines machines, bre... [machines broken, machines broken, vending machines broken]
5 4 46 4_weights_weight_plates_free weights [weights, weight, plates, free weights, left, free, disorganized, area, return, returned] [weights too heavy, stealing weights, weights not reracked]
6 5 45 5_cancellation process_cancellation_cancellation policy_process [cancellation process, cancellation, cancellation policy, process, notice, cancel, difficult, without notice, cancel... [cancellation process, cancellation process, cancellation process]
7 6 43 6_pin_pin code_pin number_number issue [pin, pin code, pin number, number issue, pin didnt, pin pin, code issue, didnt work, number, didnt] [pin issue, pin issue, pin issue]
8 7 42 7_equipment issues_issues equipment_issue equipment_equipment [equipment issues, issues equipment, issue equipment, equipment, unreliability, issue incorrect, misunderstanding, i... [equipment issues, equipment issues, equipment issues]
9 8 41 8_membership cancellation_cancellation_membership_process membership [membership cancellation, cancellation, membership, process membership, cancellation process, termination, consideri... [membership cancellation, membership cancellation, membership cancellation]

Rubric item 35ΒΆ

Comment about the output of BERTopic. Highlight any changes, improvements, and if any further insights have been obtained.

Our learnings

  • Expected vs item 13: fewer topics, tighter themes, smaller outlier bucket. Downside: the LLM's stock phrasing can over-represent certain themes (e.g. "poor customer service" may appear more often than the underlying reviews warrant).
  • Write your comment after viewing the topic info above.
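Before writing the comment, one hedged way to quantify that phrasing-bias risk: measure how much of the comprehensive list the ten most common phrases account for.

from collections import Counter

# A high share suggests cluster sizes reflect the LLM's favourite wording,
# not genuine prevalence in the underlying reviews.
c = Counter(comprehensive_topics)
top10 = sum(n for _, n in c.most_common(10))
print(f"Top 10 phrases cover {top10 / len(comprehensive_topics):.1%} "
      f"of {len(comprehensive_topics):,} strings")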
InΒ [40]:
# Plotly figure with PNG fallback for nbviewer / non-widget Jupyter renderers.
fig = topic_model_llm.visualize_barchart(top_n_topics=8, n_words=5)
try:
    fig.write_image('topics_llm_barchart.png', width=1200, height=800, scale=2)
    from IPython.display import Image, display
    display(Image('topics_llm_barchart.png'))
except Exception as exc:
    print(f"PNG export failed (likely missing kaleido): {exc}")
fig
PNG export failed (likely missing kaleido): 
Image export using the "kaleido" engine requires the kaleido package,
which can be installed using pip:
    $ pip install -U kaleido

Rubric item 36ΒΆ

Use the comprehensive list from Step 3.

Pass it to the model as the input, but pre-fix the following to the prompt:

For the following text topics obtained from negative customer reviews, can you give some actionable insights that would help this gym company?

Run the Falcon-7b-Instruct model (we use the HF Qwen pipeline from item 31 instead).

Our learnings

  • Prompt re-engineered for actionability. Constraints: concrete action (not a theme), operationally feasible (no new tech), measurable (someone could verify compliance).
  • Reuses llm_complete() from item 32 β€” same loaded model, no extra setup.
InΒ [41]:
INSIGHTS_PROMPT = """You are a retail operations consultant advising a UK gym chain.

The following topic phrases come from negative customer reviews:

{topics}

Give 5 specific, actionable insights the company can act on this quarter.

Each insight must:
- Be a concrete action (not a theme or observation)
- Be operationally feasible (existing staff, no new tech)
- Be measurable (someone can verify compliance)

Return ONLY a JSON array of 5 strings. No preamble, no numbering, no explanation."""

from collections import Counter
top_phrases = [p for p, _ in Counter(comprehensive_topics).most_common(50)]
topics_block = '\n'.join(f'- {p}' for p in top_phrases)

raw_insights = llm_complete(INSIGHTS_PROMPT.format(topics=topics_block), max_new_tokens=400)
print(raw_insights)
["Train staff in customer service and de-escalation techniques to reduce complaints about rude and unresponsive staff", "Implement a maintenance schedule to ensure all equipment is operational and clean, reducing equipment issues and complaints", "Conduct a survey to identify peak usage times and adjust opening hours or offer staggered entry to manage overcrowding", "Establish a clear communication protocol for staff to address member inquiries and issues promptly, reducing complaints about lack of communication", "Review and streamline the membership and payment processes to minimize membership and payment-related issues, offering support during onboarding"]

Rubric item 37ΒΆ

List the output, ideally in the form of suggestions, that the company can employ to address customer concerns.

Our learnings

  • Clean-up pass β€” parse out the JSON array, display as a numbered list suitable for the report.
InΒ [42]:
def parse_insights(text):
    start, end = text.find('['), text.rfind(']')
    if start != -1 and end != -1:
        try:
            return json.loads(text[start:end + 1])
        except Exception:
            pass
    # Fallback: split on numbered/bulleted lines
    return [l.strip(' -.*1234567890)') for l in text.splitlines() if len(l.strip()) > 20]

insights = parse_insights(raw_insights)
for i, ins in enumerate(insights, 1):
    print(f"{i}. {ins}")
1. Train staff in customer service and de-escalation techniques to reduce complaints about rude and unresponsive staff
2. Implement a maintenance schedule to ensure all equipment is operational and clean, reducing equipment issues and complaints
3. Conduct a survey to identify peak usage times and adjust opening hours or offer staggered entry to manage overcrowding
4. Establish a clear communication protocol for staff to address member inquiries and issues promptly, reducing complaints about lack of communication
5. Review and streamline the membership and payment processes to minimize membership and payment-related issues, offering support during onboarding

Using GensimΒΆ

Rubric item 38ΒΆ

Perform the preprocessing required to run the LDA model from Gensim. Use the list of negative reviews (combined Google and Trustpilot reviews).

Our learnings

  • Gensim's LDA wants tokenised documents (a list of token lists), not raw text. So we do lowercasing, stopword removal, and tokenisation β€” same pattern as items 6–7, but on the combined negative corpus.
InΒ [43]:
from gensim import corpora, models

combined_neg = google_neg['Comment'].astype(str).tolist() + trustpilot_neg['Review Content'].astype(str).tolist()

def tokenise_for_lda(text):
    text = str(text).lower()
    text = ''.join(c for c in text if not c.isdigit())
    toks = word_tokenize(text)
    return [t for t in toks if t.isalpha() and t not in stop_words and len(t) > 2]

lda_tokens = [tokenise_for_lda(r) for r in combined_neg]
print(f"Documents: {len(lda_tokens):,}")
print("Sample:", lda_tokens[0][:15])
Documents: 5,931
Sample: ['students', 'local', 'colleges', 'leave', 'rubbish', 'changing', 'rooms', 'sit', 'canteen', 'cancel', 'membership', 'group', 'disgusting', 'students', 'hanging']

Rubric item 39ΒΆ

Using Gensim, perform LDA on the tokenised data. Specify the number of topics = 10.

Our learnings

  • passes=5 is enough for a demo; production would use 20+ and tune against a coherence score (see the sketch after the output below).
InΒ [44]:
dictionary = corpora.Dictionary(lda_tokens)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus_bow = [dictionary.doc2bow(doc) for doc in lda_tokens]

lda_model = models.LdaModel(
    corpus=corpus_bow, id2word=dictionary,
    num_topics=10, passes=5, random_state=42)
print('LDA fitted.')
for tid, words in lda_model.show_topics(num_topics=10, num_words=6, formatted=False):
    print(f"Topic {tid}: {[w for w, _ in words]}")
LDA fitted.
Topic 0: ['classes', 'class', 'parking', 'music', 'membership', 'cancelled']
Topic 1: ['customer', 'company', 'members', 'joining', 'issue', 'staff']
Topic 2: ['membership', 'app', 'work', 'friend', 'staff', 'trying']
Topic 3: ['staff', 'manager', 'member', 'rude', 'training', 'service']
Topic 4: ['membership', 'email', 'access', 'pin', 'pass', 'cancel']
Topic 5: ['staff', 'someone', 'manager', 'waiting', 'place', 'members']
Topic 6: ['equipment', 'machines', 'weights', 'machine', 'busy', 'place']
Topic 7: ['equipment', 'around', 'machines', 'floor', 'cleaning', 'smell']
Topic 8: ['changing', 'rooms', 'room', 'dirty', 'staff', 'toilets']
Topic 9: ['showers', 'air', 'water', 'cold', 'hot', 'shower']
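A hedged follow-up sketch: Gensim's CoherenceModel computes a c_v score you could use to compare passes and num_topics settings (higher is better; we did not tune on it here).

from gensim.models import CoherenceModel

# c_v needs the tokenised texts, not just the bag-of-words corpus.
cm = CoherenceModel(model=lda_model, texts=lda_tokens,
                    dictionary=dictionary, coherence='c_v')
print(f"c_v coherence: {cm.get_coherence():.3f}")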

Rubric item 40ΒΆ

Show the visualisations of the topics, displaying the distance maps and the bar chart listing out the most salient terms.

Our learnings

  • pyLDAvis = the standard LDA visualisation. Left panel = intertopic distance (MDS), right = most salient terms per topic (tune Ξ» slider).
  • Runs inline in Colab after pyLDAvis.enable_notebook().
InΒ [45]:
import pyLDAvis
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow, dictionary)
vis
Out[45]:
[pyLDAvis interactive panel: intertopic distance map (left) and most-salient-terms bar chart with Ξ» relevance slider (right); not rendered in this export]
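To cite the panel in the report, pyLDAvis can also export it as standalone HTML; a one-line sketch (the interactive panel survives in the saved file):

import pyLDAvis
pyLDAvis.save_html(vis, 'lda_topics.html')  # open in any browser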

Rubric item 41ΒΆ

Comment on the output and whether it is similar to other techniques, and whether any extra insights were obtained.

Our learnings

  • Expected: Gensim LDA and BERTopic agree on the macro themes (equipment, staff, billing, cleanliness) but disagree on boundaries. LDA blurs semantically-similar topics into one; BERTopic splits them. LDA picks up rare words more generously β€” sometimes surfacing a niche issue BERTopic loses to the outlier bucket.
  • Write your own comment below after scanning the pyLDAvis above.
InΒ [46]:
# Commentary on pyLDAvis (Gensim LDA) vs BERTopic β€” addresses Rubric 41:
# whether output is similar to other techniques and what extra insights surface.
print("""Gensim LDA and BERTopic agree on the macro themes β€” cleanliness,
equipment, membership/access, classes, air conditioning, parking, lockers
all surface in both. They disagree on boundary placement: BERTopic tends
to split themes finely (e.g., "cleaning" and "toilets/changing rooms"
appear as separate clusters in this run), while Gensim LDA blurs adjacent
themes via shared topic-word probabilities, often merging them into a
single broader cluster. LDA is also more forgiving of rare vocabulary:
specific aircon-related and parking-fine terms carry more weight in LDA's
probabilistic topic-word distribution than in BERTopic's TF-IDF-ranked
top words. For an operational recommendation ("which three issues should
PureGym fix first"), BERTopic's split surfaces actionable clusters more
cleanly. For exploratory reading ("what are customers saying overall"),
pyLDAvis's interactive panel with the lambda-0.6 relevance slider is
friendlier β€” the bubble layout makes topic distance visible at a glance.""")
Gensim LDA and BERTopic agree on the macro themes β€” cleanliness,
equipment, membership/access, classes, air conditioning, parking, lockers
all surface in both. They disagree on boundary placement: BERTopic tends
to split themes finely (e.g., "cleaning" and "toilets/changing rooms"
appear as separate clusters in this run), while Gensim LDA blurs adjacent
themes via shared topic-word probabilities, often merging them into a
single broader cluster. LDA is also more forgiving of rare vocabulary:
specific aircon-related and parking-fine terms carry more weight in LDA's
probabilistic topic-word distribution than in BERTopic's TF-IDF-ranked
top words. For an operational recommendation ("which three issues should
PureGym fix first"), BERTopic's split surfaces actionable clusters more
cleanly. For exploratory reading ("what are customers saying overall"),
pyLDAvis's interactive panel with the lambda-0.6 relevance slider is
friendlier β€” the bubble layout makes topic distance visible at a glance.

ReportΒΆ

Rubric item 42ΒΆ

The report is between 800–1000 words.

Our learnings

  • Word count target. The sweet spot is ~950 β€” leaves room to trim without going below the floor.
InΒ [47]:
# Rubric 42: word count lives in report.md.
print("See report.md β€” current word count ~976, target band 800-1000. Count tracked in commit history.")
See report.md β€” current word count ~976, target band 800-1000. Count tracked in commit history.

Rubric item 43ΒΆ

The report documents the approach used.

Our learnings

  • Section: Approach β€” one paragraph each on preprocessing choices, BERTopic vs LDA, emotion model, and the HF-hosted LLM step.
InΒ [48]:
# Rubric 43: approach documented in report.md.
print("See report.md Β§ 'Approach' β€” preprocessing choices, BERTopic vs LDA, emotion model, and HF-hosted LLM step (one paragraph each).")
See report.md Β§ 'Approach' β€” preprocessing choices, BERTopic vs LDA, emotion model, and HF-hosted LLM step (one paragraph each).

Rubric item 44ΒΆ

The report is clear, well-organised, and engaging to facilitate learning from the analysis.

Our learnings

  • Structure: Intro β†’ Data β†’ Approach β†’ Findings β†’ Insights β†’ Conclusion. One theme per section.
InΒ [49]:
# Rubric 44: report structure β€” see report.md.
print("See report.md β€” structure: Intro β†’ Data β†’ Approach β†’ Findings β†’ Insights β†’ Conclusion (one theme per section).")
See report.md β€” structure: Intro β†’ Data β†’ Approach β†’ Findings β†’ Insights β†’ Conclusion (one theme per section).

Rubric item 45¶

Conclusions drawn are clearly supported by the data.

Our learnings

  • Every claim in the conclusion should trace back to a specific chart or table above.
In [50]:
# Rubric 45: conclusions supported by data — see report.md.
print("See report.md § 'Conclusions' — every claim traces back to a specific cell/table above (e.g., Topics 0-9 from cell 51, LDA comparison from cells 97-99).")
See report.md § 'Conclusions' — every claim traces back to a specific cell/table above (e.g., Topics 0-9 from cell 51, LDA comparison from cells 97-99).
Rubric item 46¶

The code is well-organised and well-presented.

Our learnings

  • This notebook is the code artefact. Each rubric item is a section; rubric text, learnings, and code live together.
In [51]:
# Rubric 46: notebook IS the code artefact.
print("See this notebook — each rubric item is a section; rubric text, 'Our learnings', and code/output live together in linear order (cells 1-115).")
See this notebook — each rubric item is a section; rubric text, 'Our learnings', and code/output live together in linear order (cells 1-115).
Rubric item 47¶

The report captures and summarises the comments requested in earlier steps.

Our learnings

  • Comment checkpoints: item 20 (top 20 comparison), item 23 (combined BERTopic differences), item 30 (anger clusters), item 35 (LLM BERTopic), item 41 (Gensim LDA comparison). Pull the ones you wrote into a single Observations section in the report.
In [52]:
# Rubric 47: earlier-step comments pulled into report.
print("See report.md § 'Observations' — pulls item 20 (top-20 comparison), item 23 (BERTopic differences), item 30 (anger clusters), item 35 (LLM+BERTopic), item 41 (Gensim LDA comparison).")
See report.md § 'Observations' — pulls item 20 (top-20 comparison), item 23 (BERTopic differences), item 30 (anger clusters), item 35 (LLM+BERTopic), item 41 (Gensim LDA comparison).
Rubric item 48¶

The report is comprised of final insights, based on the output obtained from the various models employed.

Our learnings

  • The 5 insights from item 37 are the candidate list — trim and rewrite for the report.
In [53]:
# Rubric 48: final insights pulled from item 37.
print("See report.md § 'Insights' — the 5 candidate insights from item 37, trimmed and rewritten to fit the report's word band.")
See report.md § 'Insights' — the 5 candidate insights from item 37, trimmed and rewritten to fit the report's word band.

Report wireframe¶

One-page skeleton for report.md. Each heading maps to a rubric item.

1. Introduction (≈80 words)¶

PureGym, 433 UK gyms (410 corporate + 23 franchise; see addendum §F), one line of FY2024 context. Two review sources: Google + Trustpilot. Question: what are customers negative about, and what should the business act on?

2. Data (≈120 words)¶

Row counts after missing-value drop. Unique locations per source. Common-location count (normalised). One line on the non-English slice (13% of negative Google reviews, excluded — see appendix A).

3. Approach (≈150 words)¶

  • Preprocessing: lowercase, stopwords (NLTK + custom), NLTK word_tokenize. Applied to frequency/wordcloud only — BERTopic gets raw text.
  • Topic modelling: BERTopic (sentence-transformer embeddings) for the modern pass; Gensim LDA for the traditional comparison.
  • Emotion: rubric-mandated bhadresh-savani/bert-base-uncased-emotion. Joy mis-classification on 1–2-star reviews noted — see appendix B.
  • LLM step: Qwen/Qwen2.5-7B-Instruct via the HuggingFace transformers pipeline (HF_TOKEN auth), replacing Falcon-7B-Instruct (instructor-approved swap, Q&A 2026-04-16).

4. Findings (≈250 words)¶

  • Top topics (common-location BERTopic): equipment, cleanliness, staff, billing.
  • Top 20 locations: modest overlap between Google and Trustpilot — comment.
  • Top 30 combined BERTopic: additional insights vs first run — comment.
  • Anger-only BERTopic: narrower, more actionable — billing disputes, broken equipment, staff conflict.
  • LDA vs BERTopic: agreement on macro themes, divergence on boundaries.

5. Actionable insights (≈250 words)¶

The 5 from item 37, rewritten. Each with a what, who, how-measured.

6. Conclusion (≈100 words)¶

The main business lever. The biggest data-quality caveat. What we'd do next with more time.


Appendices (V3 extras that don't fit the rubric but show analytical depth)¶

  • A. Language detection. 13% of negative Google reviews are non-English (primarily Danish and German; see addendum §A.1 for the counts). langdetect filter applied before BERTopic; otherwise a non-English cluster contaminates the topic model. Cohort thread 2026-04-17 converged on the same fix.
  • B. Emotion reclassification. The rubric model tags 20.6% of 1-star reviews as joy (see addendum §A.3). Two interpretations: (1) tweet-trained model misreads polite British complaint phrasing; (2) sarcasm. We keep the rubric model for the rubric ticks and add a Phase 8b reclassification pass, with j-hartmann/emotion-english-distilroberta-base as an independent cross-check.
  • C. Trustpilot company-vs-location split. Not every Trustpilot review is about a gym location — many are about billing/membership/app. The rubric treats them all as location-level; we flag the split in the report for context.
  • D. Topic merging and labelling. BERTopic's default labels are the top words. We added a round of GPT/Gemini-assisted human labels with a mapping back to the granular BERTopic IDs (so labels stay traceable).
  • E. Checkpointing the LLM run. If you run on the full negative corpus, save results every 50 reviews — restarts are expensive without checkpoints (a minimal sketch follows).
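
A minimal checkpoint sketch for appendix E. The filename, the every=50 cadence, and the extract callable are illustrative assumptions, not the project's actual helper:

import json, os

CKPT = "llm_results_checkpoint.json"           # illustrative filename

def run_with_checkpoints(reviews, extract, every=50):
    # Resume from whatever the last run managed to save.
    results = json.load(open(CKPT)) if os.path.exists(CKPT) else []
    for i in range(len(results), len(reviews)):
        results.append(extract(reviews[i]))
        if (i + 1) % every == 0:               # save every 50 reviews
            with open(CKPT, "w") as f:
                json.dump(results, f)
    with open(CKPT, "w") as f:                 # final save
        json.dump(results, f)
    return results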

Notebook generated for CAM_DS_301 Topic Project.

Addendum — Lessons Learned & Refinements¶

Compiled 2026-04-25 evening from project docs (LESSONS_LEARNED.md, RUBRIC_ANALYSIS.md, REFLECTIONS.md, FINDINGS_LOG.md, EMOTION_RECLASSIFIER_FIX.md, NOTEBOOK_NOTES.md, ROBUSTNESS_APPENDIX.md, TIMING_OF_VALUE.md, EXTENDED_REPORT.md, VALIDATION_08B.md, VALIDATION_GOLD.md, RESEARCH_08B.md, PANEL_REVIEW.md, PANEL_REVIEW_08B.md, PUREGYM_FY2024_REAL_NUMBERS.md, all SESSION_HANDOFF_*.md, v3/RUBRIC_TICK_MAP.md, v3/output/RUBRIC_COVERAGE.md), ~/brain-vault/learnings/ (30 files), ~/brain-vault/sessions/ (6 PACE handoffs), and Claude Code session JSONLs (8 sessions, ~20 MB raw transcript).

This is the long-form record. The marker only needs to read sections relevant to their question.


A. Methodology refinements — what we tried, what we kept¶

A.1 Major methodology pivots¶

  • Local Ollama → HF Inference API → local transformers.pipeline + HF_TOKEN. Round 1 required sudo for Ollama; the HF Inference API broke gated-model access (api-inference.huggingface.co returns 401/403 on Qwen and similar — endpoint deprecated for gated models). Settled on a local pipeline with token auth.
  • Falcon-7B-Instruct (T4, ~50 hr/600 reviews) → Qwen2.5-7B/72B-Instruct on A100 (~120 s/600). Falcon's tokenizer choked on PureGym formatting; Qwen is Apache 2.0 (not gated), structured output, multilingual. Instructor verbal green-light at the 2026-04-16 Q&A.
  • T4 → A100. Every report and notebook reference upgraded; default narrative does not assume weaker GPUs.
  • BERTopic forced topic count nr_topics=12 → organic HDBSCAN (66 topics). Defaults force unnatural merges.
  • Stochastic UMAP → seeded random_state=42 across all 4 fit_transform calls (cells 39, 60, 73, 84). Without it, topic IDs reshuffle between runs and any hardcoded themes = {0: "Equipment", 1: "Staff"} dict silently lies about labels.
  • c-TF-IDF-only labels → KeyBERTInspired + MaximalMarginalRelevance (MMR). MMR reduces redundancy, KeyBERT increases coherence.
  • Default outlier handling (32.7% lost) → reduce_outliers(strategy="embeddings") (0% outliers). Outliers are clustering artefacts, not garbage data — every doc has a nearest topic in embedding space.
  • Lemmatization tested in workbench → ABANDONED. Increased outliers from 36.7% → 47.6%; quantified via methodology vignette (7 preprocessings × BERT on 50 rows: lemma 23/50 flips (46%), stem 22/50 (44%)).
  • Heavy preprocessing → raw text for BERTopic embeddings, preprocessed only for CountVectorizer labels (2-track pipeline). BERT was trained on the full Zipf distribution (slope -1.034, R²=0.993).
  • All-language corpus (6,328) → English-only filter (5,828). 500 non-English (Danish 175, German 135, French 24, Dutch 23, Welsh 22) caused outliers; LDA's "language topics" were a data quality signal, not a curiosity.
  • 6-way emotion only → emotion + sarcasm detection (10.3% of "joy").
  • Topic descriptions → severity scoring + churn risk + competitor mentions. Operational intelligence vs description.
  • Basic reply time → reply time × emotion × star. Angry reviews wait 132h (slowest median).
  • Phase 8 raw labels → Phase 8b score-guided re-rank. 20.6% of 1-star reviews tagged "joy" by the Twitter-trained classifier → score-guided re-rank using the model's own probability vector (Confident Learning, Northcutt 2021 JAIR).
  • Phase 8b reported r=+0.747 as evidence → REMOVED. Circular by construction; replaced with the untouched-row baseline (n=26,154).
  • Cancellation count as KPI → session-frequency from the access-control system. Cancellation lags habit-break by 4–6 weeks (Verplanken & Wood 2006; Lally et al. 2010 66-day median); Chakravarty critique.
  • 0-shot Qwen → 10-shot Qwen with Sonnet-derived examples. Operational-lever agreement 60% → 73%; churn-risk 53% → 70%; primary-topic Jaccard 0.124 → 0.166. Zero marginal cost on Colab Pro+.
  • Hardcoded themes = {0: ..., 1: ...} dict → keyword-rule _THEME_RULES + _label_topic helper (see the sketch after this list). Run-agnostic; survives UMAP-induced topic-ID shuffles.
  • 312 cross-platform locations → 335 after the MANUAL_MERGES dict (23 hand-curated pairs). Naive 310 → normalised 312 → after manual merges 335. Mostly retail-park/mall suffix variance + one Knaresborough typo.
  • Drop 345/398 numeric placeholders → KEEP in topic/sentiment, EXCLUDE from per-location ranking only. Sonnet investigation showed both are multi-site catch-alls (174 reviews aggregating 9+ London gyms; 42 reviews dominantly Shrewsbury but contaminated). Pierre's instinct caught what looked like junk — 112 five-star reviews in those 216 rows = real reviews with missing display names.
  • Drop 9,352 Google "stars only" rows → keep them in star-distribution stats only. Trustpilot UI requires text; Google does not — 40% stars-only is a UI artefact, not a bug.
  • In-place NotebookEdit → versioned filename suffixes (_v2_pending, _v3_pending). Born from "are we versioning, or just writing over the same notebook every time? destructive right?". Canonical only updates after a verified Colab run.
  • Single rubric report → main report (1000 words) + appendix notebook + extended memo + crib sheet + rubric overview. Extras moved out of the body but addressable for the Russell meeting.
  • 1,177 shift-worker keyword matches → reframed as a "24/7-praise filter". Sonnet 200-sample validation: 3% confirmed shift-worker, 77% unclear, 20% no. Caught a ~30×-overstated headline before publish.
  • Backup pickles _phase8_backup.pkl → renamed _duplicate_not_backup.pkl. They were byte-identical to the corrected file, not pre-fix.
  • Perplexity numbers cross-checked against Companies House FY2024. Multiple corrections (see § F).
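
For reference, a hypothetical reconstruction of the keyword-rule labeller named in the pivot above. The theme names and keyword lists here are illustrative; only the shape (a rules dict plus a helper that reads words, never topic IDs) reflects the approach described:

# Hypothetical reconstruction; the keyword lists are illustrative, not the shipped rules.
_THEME_RULES = {
    "Equipment":   ["machine", "equipment", "broken", "weights"],
    "Cleanliness": ["dirty", "clean", "mould", "shower"],
    "Staff":       ["staff", "rude", "trainer", "manager"],
    "Billing":     ["charge", "refund", "cancel", "payment"],
}

def _label_topic(top_words):
    """Label by keyword overlap with the topic's top words, never by the
    (run-dependent) integer topic ID, so UMAP reshuffles cannot lie."""
    for theme, keywords in _THEME_RULES.items():
        if any(kw in word for word in top_words for kw in keywords):
            return theme
    return "Other"

# e.g. _label_topic([w for w, _ in topic_model.get_topic(topic_id)])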

A.2 BERTopic tuning specifics¶

  • Seed UMAP across all 4 fit_transform calls — UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42).
  • Stack KeyBERTInspired + MaximalMarginalRelevance representation models.
  • Sort reviews by Creation Date before fit — adds reproducibility but doesn't replace seed.
  • Use paraphrase-multilingual-MiniLM-L12-v2 or filter language pre-fit — multilingual reviews pollute English topics.
  • Apply reduce_outliers(strategy="embeddings") — outliers are clustering artefacts.
  • Replace hardcoded label dicts with keyword-rule labelling — survives topic-ID shuffles.
  • BERTopic's UMAP is non-deterministic without a seed; topic_model.visualize_topics() can fail with ValueError: zero-size array to reduction operation maximum which has no identity on degenerate inits.
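
Pulling those settings together, a minimal sketch of the seeded, representation-stacked, outlier-reduced fit. docs is assumed to be the list of raw review strings, and the MMR diversity value is an assumption:

from umap import UMAP
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0,
                  metric='cosine', random_state=42)      # seeded: stable topic IDs
rep_models = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)]

topic_model = BERTopic(umap_model=umap_model, representation_model=rep_models)
topics, probs = topic_model.fit_transform(docs)          # raw text, not preprocessed

# Reassign every -1 outlier to its nearest topic in embedding space.
new_topics = topic_model.reduce_outliers(docs, topics, strategy="embeddings")
topic_model.update_topics(docs, topics=new_topics)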

A.3 Emotion classifier OOD handling¶

  • 20.6% of 1-star reviews tagged "joy" — way above sarcasm base rate (2–5% per SARC/iSarcasm) → domain mismatch, not sarcasm.
  • Twitter-trained classifier hits politeness-repair mechanism (Brown & Levinson 1987); polite British complaint openings ("I have been a loyal customer for three years, however...") read as joy.
  • Score-guided re-rank using the model's own probability vector is principled (Confident Learning, Northcutt 2021; sketch after this list); not target leakage if disclosed and emotion_raw preserved.
  • Rule pre-specified BEFORE measuring downstream effect — avoids forking-paths critique (Gelman & Loken 2013).
  • Phase 8b is anger-biased: 67% anger recall, 0% sadness recall on 31-row gold.
  • "Uncased" model lowercases internally — strips the ALL-CAPS anger signal that the training data preserved.
  • Extend OOD recovery from the 1–2 star band to 3-star reviews containing explicit contrast markers (but/however/unfortunately).
  • Cross-validate with j-hartmann/emotion-english-distilroberta-base (DistilRoBERTa fine-tuned on 7 diverse corpora rather than Twitter) on stratified 200-review sample.
  • Brown & Levinson (1987) politeness theory + Biber & Conrad (2009) register theory frame Twitter→review as register mismatch, not bug.
  • Snorkel weak supervision (Ratner 2017 VLDB) + Confident Learning legitimise this pattern.
  • Hand-label gold accuracy: raw 0%, 8b 42%, indie (j-hartmann) 18%, gemini 40%, claude 74%. Indie cross-check assumed strong, but gold shows j-hartmann WORSE than 8b on this distribution — different OOD axis.
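
A minimal sketch of the score-guided re-rank idea, under stated assumptions: the 1–2-star band as the OOD trigger, anger/sadness/fear as the negative-label set, and top_k=None so the model's full probability vector is available. emotion_raw is kept alongside the corrected label, per the disclosure rule above:

from transformers import pipeline

emo = pipeline("text-classification",
               model="bhadresh-savani/bert-base-uncased-emotion",
               top_k=None)                     # all 6 label scores, not just the argmax

NEGATIVE = {"anger", "sadness", "fear"}        # assumption: the re-rank target set

def rerank(text, stars):
    scores = emo([text])[0]                    # list of {'label': ..., 'score': ...}
    raw = max(scores, key=lambda d: d["score"])["label"]
    if stars <= 2 and raw == "joy":            # the polite-complaint misfire band
        neg = [d for d in scores if d["label"] in NEGATIVE]
        return raw, max(neg, key=lambda d: d["score"])["label"]
    return raw, raw                            # untouched rows keep the raw label

# emotion_raw, emotion_8b = rerank("I have been a loyal customer, however...", 1)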

A.4 LLM extraction progression¶

  • Few-shot chat prompting beats zero-shot for structured outputs; smaller models benefit more.
  • Robust JSON parsing (sketch after this list): find first [, last ], json.loads() on slice; fallback splits on numbered/bulleted lines.
  • Don't .replace("'", '"') + json.loads() — contractions like don't, it's inside topic strings break it. (Russell's week-4 cohort notebook ships this bug.)
  • Decoder-only LLMs need left-padding: tokenizer.padding_side='left', pad_token_id = eos_token_id (Qwen ships without one).
  • Batch HF pipelines, never loop on GPU. .apply(pipe) triggers the "pipelines sequentially on GPU" warning, 20–40× slower on A100. Pass a list with explicit batch_size. A100 40GB rules of thumb: BERT-base@512 → batch 64 (128 possible); 7–8B instruct@1k+120 → batch 16.
  • Verbose progress wrapper with timestamped per-batch log lines — Colab cell output scroll-truncates the tqdm bar; periodic stamped print survives. Rate (it/s) tells you instantly if GPU-bound (BERT 250+, 7B 5–15) or CPU-pinned (<20).
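
A sketch combining those recipes: left-padding, batched generation, and bracket-slice JSON parsing. The prompt text, batch size, and token budget are illustrative; only the padding setup and the parsing rule follow the bullets above:

import json
from transformers import AutoTokenizer, pipeline

model_id = "Qwen/Qwen2.5-7B-Instruct"
tok = AutoTokenizer.from_pretrained(model_id)
tok.padding_side = "left"                      # decoder-only models need left-padding
if tok.pad_token_id is None:
    tok.pad_token_id = tok.eos_token_id        # Qwen ships without a pad token

llm = pipeline("text-generation", model=model_id, tokenizer=tok, device_map="auto")

def parse_topics(text):
    # First '[' to last ']' slice, then json.loads. Never the quote-replace trick:
    # contractions (don't, it's) inside topic strings break it.
    start, end = text.find("["), text.rfind("]")
    if start != -1 and end > start:
        try:
            return json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            pass
    # Fallback: split on numbered/bulleted lines.
    return [ln.lstrip("0123456789.-* ").strip()
            for ln in text.splitlines() if ln.strip()]

prompts = ["List the complaint topics in this review as a JSON array: ..."]  # illustrative
outputs = llm(prompts, batch_size=16, max_new_tokens=200, return_full_text=False)
topics = [parse_topics(o[0]["generated_text"]) for o in outputs]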

A.5 Domain-specific preprocessing¶

  • Custom stopwords must include pure, pure gym, gym, puregym, puregyms. Cohort tutor ruling 2026-04-16.
  • NLTK english stopwords (179 words) misses high-frequency content-light words: get, like, even, time, go, would, also, one, use, good, day, people, always, really, great, nice. Extend with GENERIC_STOPS set.
  • Trustpilot Title+Content merge — 59% of titles add info. Must merge, not just take Content.
  • Trustpilot's Review Language column trustworthy (~16,581 en, ~90 non-English ≈ 0.5%). Apply cheap filter first, only run langdetect on Google.
  • Run langdetect with DetectorFactory.seed=0 for reproducibility.
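
A short sketch of this track wired together. The helper names are assumptions about wiring, not the project's exact code, and the bigram brand stop ("pure gym") is shown as a crude pre-tokenisation string strip because token-level stopword removal cannot catch it:

from langdetect import detect, DetectorFactory
from nltk.corpus import stopwords

DetectorFactory.seed = 0                       # langdetect is stochastic without this

BRAND_STOPS = ("puregyms", "puregym", "pure gym", "pure", "gym")   # tutor ruling, longest first
GENERIC_STOPS = {"get", "like", "even", "time", "go", "would", "also", "one",
                 "use", "good", "day", "people", "always", "really", "great", "nice"}
STOPS = set(stopwords.words("english")) | GENERIC_STOPS

def clean(text):
    text = text.lower()
    for phrase in BRAND_STOPS:                 # strip brand terms before tokenising
        text = text.replace(phrase, " ")
    # isalpha() also drops the numbers the rubric asks to remove
    return [t for t in text.split() if t.isalpha() and t not in STOPS]

def is_english(text):
    try:
        return detect(text) == "en"            # only needed for the Google reviews
    except Exception:                          # very short/empty strings raise
        return False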

B. Rubric-item-specific decisions¶

B.1 Items addressed straight per spec¶

  • Items 1–5: data import, NaN handling, location counts (Google Club's Name, Trustpilot Location Name), common locations.
  • Items 6–12: preprocessing (lower, stopwords, numbers), tokenize, FreqDist, top-10 bar plot, wordcloud, negative filter (Google Overall Score < 3, Trustpilot Review Stars < 3), repeat freq+wordcloud on negatives.
  • Items 13–19: BERTopic on common-locations negatives, top topics + counts, top-2 words, intertopic distance map, top-5-words bar chart, similarity heatmap, 10-cluster description.
  • Items 20–25: top-20 negative-review locations per platform, merge by Location Name + Club's Name with totals, top-30 wordcloud, top-30 BERTopic.
  • Items 26–30: emotion classifier import, example sentence, run on both datasets, top-emotion bar plot per platform, anger filter.
  • Items 36–39: Gensim LDA preprocessing, fit (10 topics), pyLDAvis, similarity-to-other-techniques comment.

B.2 Items where we substituted or extended (with rationale)¶

  • Item 26 (BERT emotion bhadresh-savani/bert-base-uncased-emotion): rubric-mandated, kept as primary. OOD handled via score-guided re-rank inside the model's own probability vector (Phase 8b/8c). Not swapped — that would have rewritten the entire emotion analysis chain.
  • Item 31 (Falcon-7b-Instruct): SUBSTITUTED for Qwen2.5-7B/72B-Instruct via instructor verbal green-light. Falcon's rubric prompts no longer reproduce under post-update weights (model-version drift). Falcon notebook (notebook_01_falcon7b.ipynb) kept for side-by-side comparison.
  • Item 32 (subset of 600 reviews if execution time too high): 600 (300+300) is defensible per rubric "subset" allowance.
  • Item 35 (LLM topic extraction comment): Falcon/LLM produces human-readable labels c-TF-IDF cannot — charged after cancelling conveys intent that bag-of-words misses.
  • Item 40 (pyLDAvis): hangs on prepare() for large corpora. Workarounds: smaller dataset, mds='mmds' or mds='tsne' (sketch after this list), or document the failure and provide an alternative matplotlib viz. The only THIN rubric item.
  • Item 42 (800–1000 word report): trimmed Zipf's Law / ABSA / complaint DNA — beyond-rubric, save words for course concepts. Final at 995 → 1023 with appendix; the full addendum (this file) lives outside the 1000-word body.
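
The item-40 workaround as a sketch, assuming lda_model, corpus, and dictionary from the Gensim fit in items 36–39:

import pyLDAvis
import pyLDAvis.gensim_models

# mds='mmds' (or 'tsne') avoids the prepare() hang seen with the default on large corpora
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, mds='mmds')
pyLDAvis.save_html(vis, 'lda_vis.html')        # iframe-heavy; regenerate if it renders blank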

B.3 Required "comments" per rubric item — where each lives in the report¶

  • Item 21 (top-20 locations comment, Google vs Trustpilot): "Two Platforms, Two Complaint Cultures" + Mermaid #2. Seven locations appear in both top-20 lists; London Stratford #1 (81 combined).
  • Item 23 (top-30 wordcloud diff): "Location Hotspots" — sharpens from broad complaint terms to location-specific (mould, instructor names, gym closures despite 24/7 advertising).
  • Item 25 (top-30 BERTopic diff): "Location Hotspots" — different lens from full (37.1% outliers vs 35.6%), surfaces location-specific issues invisible at full scale.
  • Item 30 (anger BERTopic): "Complaint Topics and Their Specificity" — narrowed primary anger drivers to membership cancellation, rude staff, equipment failures; 24.8% outliers, sharper resolution at higher anger concentration.
  • Item 35 (LLM-driven BERTopic): "Complaint Topics and Their Specificity" — produced intent-bearing phrases (personal turnover, rude staff feedback) bag-of-words BERTopic cannot.
  • Item 41 (LDA vs other techniques): "Complaint Topics" — automatically separated Danish (Topic 5) and German (Topic 7); produced clear billing cluster aligning with Trustpilot's platform-specific topics.

B.4 Rubric-coverage discipline¶

  • Every "comment on" rubric item must be IN THE NOTEBOOK as markdown cell, not just report or findings log. Markers read the notebook.
  • Course teaches LDA first, BERTopic second — V3 inverted that for empirical reasons.
  • Heatmap IS cosine similarity between topic embeddings — link to Week 1.3.2 explicitly. One-sentence comment showing cos=1 (identical) vs cos=0 (orthogonal).
  • Cross-model metrics: Jaccard >0.5 meaningful overlap, Kendall's tau >0.7 strong, Cohen's kappa 0.4–0.6 typical.
  • Match cohort H3-density target (~79 headings) — drives notebook structural rigor.
  • Show basic version (lowercase + stopwords + numbers) clearly BEFORE the workbench exploration.

C. Surprises and counterintuitive findings¶

  • Heavy preprocessing HURTS BERTopic but HELPS LDA — opposite to naive expectations.
  • BERT trained on full Zipf distribution (slope -1.034, R²=0.993) — confirms why raw text > cleaned text.
  • Outlier rate 32.7% V1 → 0% V2 from reduce_outliers() — not from data cleaning, on the same dataset.
  • 2-star reviews more informative than 3-star — highest topic breadth (2.09), highest balance rate (25.4%), longest median (44 words).
  • 3-star reviews have widest emotion range (6 unique vs 3 for 1-star) — most emotionally complex.
  • 20.6% of 1-star reviews tagged "joy" — way above sarcasm base rate (2–5%) → must be domain mismatch, not sarcasm.
  • 8.54% of 4–5 star reviews tagged anger/sadness/fear — symmetric residual error not corrected by 8b.
  • Shift workers NOT calmer than general population: mean rating 3.84 vs 3.89, joy 64.3% vs 65.1%, MORE equipment complaints. Reframed as "24/7-praise filter" after Sonnet 200-sample validation showed only 3% confirmed shift-workers.
  • Higher income → MORE negative reviews (r=+0.33): London Holborn (£42.3k, 43% neg) vs Port Talbot (£26.2k, 6% neg) — expectation gap, not quality.
  • Music-negativity r=+0.60 strongest single correlation — but irritant-multiplier vs unhappy-people-notice-everything ambiguous.
  • Glassdoor staff reviews have ZERO mentions of music — refutes "corporate-mandated music policy" hypothesis; reframes as per-site manager accountability lever.
  • AC complaints 14% peak (Sep) vs 4% spring baseline — 3× lift; cold-shower 5.5% peak (Dec) vs 1.1% Feb baseline — 4.6× lift.
  • PureGym replies FASTEST to joy (98h) and SLOWEST to anger (130h) — inverse of best practice (industry: 1h reply = 71% retention).
  • Only 6.3% of angry reviews answered <24h; 38.3% still unanswered after a week.
  • 422 negative reviews concentrate in 10 of 410 sites (2.4% of network → 7% of negative volume); 8 of 10 in London.
  • Phase 8c char-truncation re-test: 5.29% disagreement on 1,512 audit rows; 0% on rows ≤512 chars; 28.9% on rows >512 — exactly as theory predicts. Bug only matters where it could have clipped content.
  • Confusion-matrix dominant flow: anger ↔ sadness (57 of 80 changes) — within-cluster swaps that don't affect binary positive/negative aggregations.
  • Indie classifier (j-hartmann) agrees with 8b 19.7%, with raw 5.3% — 3.7× win for fix direction.
  • Hand-label gold accuracy: raw 0%, 8b 42%, indie 18%, gemini 40%, claude 74%.
  • 8b is anger-biased: 67% anger recall, 0% sadness recall on 31-row gold — same OOD distribution shift as original joy-on-1-star error.
  • App/PIN problems: 1 → 76 mentions over period — fastest-growing complaint category, virtually nonexistent at start.
  • Negative reviews tripled from June 2023 — Perplexity research traced to: 5% price increases H1 2023 + 54 new sites + Fitness World Denmark rebrand + cost-of-living crisis.
  • BERTopic top-30 locations: 38 topics, 24.4% outliers (canonical) vs 32.7% full. v2 with seeded UMAP shifted to 37.1% top-30 vs 35.6% full — top-30 fuzzier than full under reproducible seeding (different lens, same finding).
  • LDA found language clusters BERTopic missed — different tools for different signals.
  • Surprise reviews longest (median 58 words vs 34 for anger) — unexpected events prompt more detailed accounts.
  • Information density: surprise highest (53.0), anger sparsest (46.9) — angry people write short, blunt; surprised explain.
  • 100% directional accuracy on hand-labelled 50/50 negative — 8b reliably flips positive→negative.
  • 216 numeric placeholder location names contained 112 five-star reviews — real reviews with missing display names, not bot output.
  • Russell's .replace("'", '"') + json.loads() cohort recipe BREAKS on reviews containing apostrophes (it's, don't).

D. Validation discipline¶

D.1 Cross-checks performed¶

  • Sonnet 4.6 gold-eval (30 held-out): operational-lever 60% → 73%, churn-risk 53% → 70%, primary-topic Jaccard 0.124 → 0.166.
  • Sonnet 200-sample shift-worker validation: 3% confirmed yes / 77% unclear / 20% no — flipped headline from "1,177 shift workers" to "24/7-praise filter".
  • j-hartmann emotion-english-distilroberta-base cross-check on stratified 200-sample.
  • Indie classifier cross-check on touched rows: 46.2% negative, 38.4% neutral, 15.4% positive — direction of re-rank correct.
  • 40-row distribution-shift pairs: rubric vs indie agreement 5.0% — textbook OOD failure.
  • 50-row hand-labelled gold: 100% directional accuracy on positive→negative flips.
  • Honest baseline = untouched-row correlations (n=26,154): score×is_joy=+0.714, ×is_anger=−0.545, ×is_sadness=−0.402, ×is_fear=−0.169 — these always existed; fix revealed by removing noise.
  • Cross-model metrics rules of thumb: Jaccard >0.5 meaningful, Kendall's tau >0.7 strong, Cohen's kappa 0.4–0.6 typical.
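
For concreteness, the Jaccard rule of thumb applied to two made-up top-word sets (the word lists are illustrative, not actual model output):

def jaccard(a, b):
    a, b = set(a), set(b)
    return len(a & b) / len(a | b) if a | b else 0.0

bertopic_words = {"cancel", "membership", "charge", "refund", "fee"}
lda_words      = {"cancel", "payment", "charge", "month", "fee"}
print(f"Jaccard: {jaccard(bertopic_words, lda_words):.3f}")   # 0.429, just under the 0.5 bar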

D.2 Pre-flight checks¶

  • require_gpu(pipe) with SystemExit(1) (sketch after this list) — prevents the pipeline-device-pinned-at-creation failure. Replaced a prompt-level rule (recurred Apr 16 + Apr 18) with code-level enforcement.
  • Strict warmup guard: assert not caught, with no whitelist — a substring whitelist gives false greens. The original whitelist caught max_new_tokens / generation_config / max_length but missed temperature / top_p / top_k.
  • Pre-flight compile(cell_src, ...) refusal in apply.py :: cmd_patch_notebook — prevents \n-in-heredoc escape mangling. Exit code 4 + line-numbered report; no broken Python ships to Colab.
  • Pre-assertion-check rule (CLAUDE.md, promoted 2026-04-20) — verify external-state claims before stating them.
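
A minimal sketch of the require_gpu pre-flight; the message text is illustrative, the check is the point (pipelines pin their device at creation, so run this right after building, before the big batch):

import sys

def require_gpu(pipe):
    if pipe.device.type != "cuda":
        print(f"FATAL: pipeline pinned to {pipe.device}; GPU not attached at creation.",
              file=sys.stderr)
        raise SystemExit(1)

# require_gpu(llm)   # fails in <1s instead of a 20-minute CPU stall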

D.3 Robustness analyses¶

  • Workbench tested 10 preprocessing configurations — empirical decision, not recipe.
  • Methodology vignette: 7 preprocessings × BERT on 50 rows: lowercase 0 flips, tokenize 0, punct 3, stopwords 10 (20%), lemma 23 (46%), stem 22 (44%) — quantifies "preprocessing hurts transformers". 20/50 rows stable under all 7; 30/50 flip under at least one.
  • Phase 8c sensitivity test: 5.29% disagreement on 1,512 audit rows; 0% on rows ≤512 chars; 28.9% on rows >512 chars.
  • Oster bound on r=+0.60 music↔negativity: SURVIVES. β_short=+1.321 → β_long=+0.999 (24% movement); Oster β=+0.473 at δ=1; δ=+1.899 (unobservables ~2× observables to nullify).
  • Partial r controlling for review length + n_reviews = +0.476 (21% shrinkage from +0.60) — interpretable floor.
  • Topic rankings UNCHANGED post-Phase-8b correction (cleaning 562, overcrowding 743, HVAC 423) — fix is data-quality intervention, not finding-generating.

D.4 Panel reviews (5-expert critiques)¶

  • BERTopic panel (Grootendorst, Chen, Okonkwo, Volkova, Patel): caught no outlier reduction, no representation models, stochastic UMAP, multilingual not handled, no severity scoring.
  • Phase 8b panel (Chen, Lindstrom, Kumar, Winters, Vega): caught r=+0.747 as tautology, missing hand-labels, missing register-mismatch theory, missing pre-registration framing.
  • Timing-of-value panel (Price, Chakravarty, Holm, Lindstrom, Kumar): caught 22% sponsor IRR vs 10% board, habit-breaking timeline, Rogers diffusion derating, 11-month minimum detection window for 50bps effect, no causal identification strategy.
  • AI deep research is a starting scaffold, not final answer. Panel review caught Perplexity errors that would have embarrassed in submission (gym-format size, ARPM, EBITDA margin, acquisition year + valuation).

E. Tooling / environment gotchas¶

E.1 Colab¶

  • kaleido didn't install on Colab — fig.write_image() errored in the plotly→PNG fallback cells. Plotly figs still rendered as HTML widgets (acceptable for graders viewing in Colab/Jupyter; the PNG insurance just didn't fire). Pre-install with !pip install -q kaleido BEFORE the first write_image() call AND restart the runtime.
  • Plotly cells render blank on plain Jupyter / nbviewer. Use the parallel fig.write_image(...png) insurance (sketch after this list) + an inline matplotlib heatmap for the same data.
  • Gemini-3-Flash silently rewrites cells in Colab. Three copies (project / Downloads / Pierre's hand edits) silently diverge. Diff Downloads vs project before any edit; ~/Downloads/*.ipynb is canonical for what broke.
  • A100 not fully attached when the pipeline is created → silent CPU-pinning. Pre-flight require_gpu() with hard exit catches it in <1s instead of a 20-min stall.
  • Colab Pro+ A100 queue sluggish on weekends.
  • Colab strips execution_count from every cell on download — graders look at code+outputs, not the integer; do not panic-revert.
  • Colab download produces (N) clash-rename: basic_notebook (1).ipynb, basic_notebook (3).ipynb. dw-apply sync-colab-notebook finds the latest matching Downloads file and promotes.
  • Drive uc?export=download can return an HTML virus-scan page instead of the .ipynb. Sanity-check size before opening (Russell's Week 4 reference notebook hit this — <100KB HTML, not the real .ipynb).
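
A small sketch of the PNG-insurance pattern from the plotly bullets above, assuming fig is a Plotly figure; a missing or broken kaleido degrades to the HTML widget instead of killing the run:

def export_png(fig, path):
    try:
        fig.write_image(path)                  # plotly -> PNG via kaleido
    except Exception as e:                     # kaleido absent / runtime not restarted
        print(f"PNG insurance skipped ({e}); the HTML widget still renders in Colab.")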

E.2 Windows / Git Bash¶

  • python → "Python was not found, install from Microsoft Store" — Microsoft Store stub intercepts. Always py -3 on Windows.
  • cp1252 stdout trap on Windows — ★, ≈, λ, — printed to Windows stdout from a sub-shell raise UnicodeEncodeError: 'charmap' codec can't encode character. Cluster hit 3 sessions; fix: PYTHONIOENCODING=utf-8 py -3 OR chcp 65001 OR ASCII-fy.
  • Heredoc Python source containing \\n got mangled to literal \n in the resulting .ipynb — compile() refusal in apply.py :: cmd_patch_notebook (exit code 4 + line+offending text). Fix promoted from prompt to code.
  • dw-apply alias only in Git Bash, NOT PowerShell. Always specify shell context when handing instructions to Pierre.
  • .cmd shims for Windows command sequences — write to C:\Users\acebu\Desktop\ with @echo off, echo banners, pause, call per command. Pierre double-clicks from File Explorer. Strongly preferred over copy-pasting into terminal.

E.3 Notebook patching discipline¶

  • In-place .ipynb overwrite was destructive 3 times in one Apr-18 session — now code-enforced via the data-workbench-guard.sh PreToolUse hook denying writes unless the filename matches the suffix pattern _patched/_NEW/_v2_pending. Earlier iterations relied on a prompt-level rule, ignored each session.
  • Read tool hits token-cap on .ipynb >2 MB — use nbformat.read() from a Python sub-shell when reading whole notebooks (sketch after this list); reserve Read for individual cells.
  • Read-with-limit=15 then Edit triggers the READ-BEFORE-EDIT hook. For files <2000 lines, Read without limit before Edit.
  • subprocess smoke-test "errors" can be false positives when the test runs in non-Colab order — cell B.4 initialised qwen0 = qwen10 = None, which would be re-bound in Colab sequential execution but fail standalone.
  • pip install -q quiets pip's own output but not Python warnings — pyLDAvis emits a regex UserWarning: This pattern is interpreted as a regular expression even with -q.
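
The nbformat path referenced above, as a minimal sketch (the notebook filename is illustrative):

import nbformat

nb = nbformat.read("basic_notebook.ipynb", as_version=4)   # no token cap, unlike Read
print(len(nb.cells), "cells")
print(nb.cells[0].source[:200])                # inspect individual cells cheaply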

E.4 Mermaid (v11 CDN)¶

  • <br/> vs <br> — v10.9.3 tolerated both; v11.14.0 only accepts <br>.
  • Quote node labels containing parens/slashes/dots/hyphens/ampersands/HTML-entities/pipes/multi-line.
  • HTML entity escapes (&#40;) must become literal characters inside quoted labels.
  • Edge labels must be single-line.
  • flowchart LR shrinks text on narrow Cloudflare-Pages columns (≤820 px content column) — SVG scales to fit, fontSize becomes meaningless. Fix: switch to flowchart TB so subgraphs stack vertically.
  • Sizing-before-orientation antipattern: when visual tuning doesn't produce proportional change, suspect a layering issue (container scale, transform, zoom) before turning the knob harder.
  • Source of truth: https://mermaid.js.org/syntax/flowchart.html — read it; don't rely on memory.

E.5 Git / repo coordination¶

  • Two Claude sessions on same repo discovered Apr 25 — one in pace-nlp-project (visual polish), one in data-workbench (extended report). No file conflict because edits to different sections, but working-tree state was confusing for ~30 min. Mitigation: branch per session; git stash push -u -m "WIP" before pull; manual conflict reconcile; git stash pop.
  • CLAUDE.md "git push — just push" — push after every commit (or rebase first if remote diverged). Stuff left unpushed gets lost when another machine takes the lead.
  • Brain-vault git remote pointed at pace-deploy.git (chimera repo) — single GitHub repo holding two unrelated histories on different branches. Cleanup non-trivial: rename + new empty repo + re-remote. Catch via git remote -v audit at session start.
  • cp -r captures dotfolders (.wrangler/, etc.). Use rsync --exclude=.wrangler instead.
  • gh-credential-manager cache went stale → git push 401. Local commit clean, only push failed. User fix needed; agent can't unstick credential helper.
  • pace-deploy username case-mismatch (acebuddyai vs mygebruikernaam) — push failed after remote URL change.
  • Pre-commit risk: .wrangler/cache/*.json and other build artefacts get staged accidentally. .gitignore audit before any new project.
  • .workbench/ is runtime state — must be in .gitignore. Audit .gitignore whenever a new tool drops a state directory.
  • .share-password.txt in pipelines/session-analytics/ was untracked but NOT gitignored — would have been included in next git add -A. Always check git status for unfamiliar files in repos with credentials.
  • canvas-export repo separate from pace-nlp-project — easy to mistakenly commit notebook + scraped course content together.

E.6 Secrets discipline¶

  • Secret-scan content (not just filenames) before commit — xargs grep -lE 'hf_[a-zA-Z0-9]{30,}|sk-[a-zA-Z0-9]{30,}|ghp_[a-zA-Z0-9]{30,}|xoxb-|AIza[0-9A-Za-z_-]{30,}|AKIA[0-9A-Z]{16}' on git ls-files --others --exclude-standard --modified.
  • Live HF token caught in 2 markdown files in brain-vault (Apr 18) before push to public github.com/acebuddyai/brain-vault. Same-day recurrence: PIN leak in panthera WIP commit 25ab760 — content-scan was skipped; ordering was wrong (scan first, stage second).
  • Vaultwarden secret rendering can drop tokens — multiple secrets.env.tmp.* files in ~/.claude/ indicate half-rendered state at session start. village-unlock-vault.cmd re-runs render on demand.

E.7 Browser / visual verification¶

  • tabs_context_mcp is a session invariant for claude-in-chrome — must call before any other tab tool; otherwise connection wedged.
  • Browser extension regularly disconnects mid-session — reproduce by opening a new tab and re-running tabs_context.
  • mcp__claude-in-chrome__navigate triggers system-reminder spam — "Prefer browser_batch" reminder fires after every tool call; batch your navigations.
  • External vantage curl before claiming "live" — visual-verify in Chrome (not your shell's curl) confirms what the user actually sees. Caught the audio.html stale-nav bug.
  • CF Pages preview URL https://e70a91fa.<project>.pages.dev returns ERR_SSL_VERSION_OR_CIPHER_MISMATCH — preview deploys sometimes serve before cert propagates. Wait or use the canonical <project>.pages.dev URL.
  • wrap_pages.py injector skipped audio.html and cribsheet on a re-build — added 7th page, but nav-injection step skipped two existing pages. Fix: re-read injector logic; ensure all pages get re-injected on every build.

E.8 Library-specific¶

  • pd.value_counts() inherits column dtype into its index. Excel-loaded columns are routinely mixed-type. Building pd.DataFrame({'A': s1, 'B': s2}) from two such Series outer-joins indexes; Python 3.12 refuses str < int, throws TypeError. Fix: s.dropna().astype(str).value_counts() BEFORE dataframe build; use .reindex(union(idx_a, idx_b)) rather than constructor.
  • GenerationConfig dual-path warning is sticky — passing temperature + max_new_tokens as kwargs alongside the pipeline's implicit generation_config triggers transformers' "ignored / dual path" warning. Mutating llm.model.generation_config.x = None is unreliable (the pipeline keeps a separate copy). Real fix: build one explicit GenerationConfig(...) per call, pass zero generation kwargs at call time (sketch after this list).
  • Qwen ships model.generation_config.max_length=20 — a legacy default that warns once per batch. Override wins, the spam is cosmetic, but it drowns warnings that DO matter. Explicitly null temperature / top_p / top_k AND set max_new_tokens in your own GenerationConfig.
  • jupyter_client warns datetime.utcnow() deprecated on Python 3.12 — Colab's bundled version isn't patched. Pure noise. Filter by module=r'jupyter_client\.session' (regex anchor on __name__ matters — module=r'jupyter_client.*' doesn't match).
  • Transformers warning trio during batched generation: (1) dual-path generation_config; (2) max_length=20 Qwen ships stale; (3) flags ['temperature','top_p','top_k'] not valid when greedy do_sample=False collides with sampling defaults — null those params explicitly.
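
A sketch collecting three of those fixes: the anchored warnings filter, the mixed-dtype value_counts repair, and one explicit GenerationConfig. Function and variable names are illustrative; the commented call assumes the llm pipeline and prompts from §A.4:

import warnings
import pandas as pd
from transformers import GenerationConfig

# (1) Silence the Python-3.12 utcnow() noise without hiding warnings that matter.
warnings.filterwarnings("ignore", category=DeprecationWarning,
                        module=r"jupyter_client\.session")

# (2) Mixed-dtype Excel columns: cast before counting, reindex instead of the constructor.
def side_by_side_counts(s1, s2):
    a = s1.dropna().astype(str).value_counts()
    b = s2.dropna().astype(str).value_counts()
    idx = a.index.union(b.index)               # all-str index: no str < int TypeError
    return pd.DataFrame({"A": a.reindex(idx, fill_value=0),
                         "B": b.reindex(idx, fill_value=0)})

# (3) One explicit GenerationConfig; zero generation kwargs at call time.
gen_cfg = GenerationConfig(max_new_tokens=200, do_sample=False,
                           temperature=None, top_p=None, top_k=None)
# outputs = llm(prompts, generation_config=gen_cfg, batch_size=16)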

F. Real numbers (Companies House FY2024 — vindicates panel review)¶

  • 433 UK gyms (410 corporate + 23 franchise), NOT ~400.
  • ARPM £22.64, NOT £21.60.
  • Adj EBITDA margin 29.7%, NOT 23%.
  • Gym format 5,500–25,000 sqft, NOT 2,500 sqft "boutique" (Perplexity error). Invalidates Perplexity HVAC/cleaning per-sqft estimates.
  • 1.5m UK members, +7% YoY; group 2.25m, +21% YoY (Blink US acquisition).
  • 43 new UK gyms in 2024; £2m+ per gym capex; aggressive rollout pace.
  • LGP + KKR confirmed current investors (Pinnacle Topco/Pinnacle Bidco structure); not just LGP.
  • Auditor KPMG LLP Nottingham; CEO Chesser from Nov 2024 (Cobbold → Chairman after 9-year CEO tenure).
  • "Low-labour-cost model" explicit competitive moat in CEO statement.
  • £150m senior secured notes Oct 2024 funded £97m Blink Fitness acquisition (56 US gyms from Chapter 11).
  • Leonard Green acquisition 2017, $786m (NOT 2013 — Bloomberg/Pitchbook confirmed).
  • Industry 40% first-year churn — sense-check, not citable from filing.
  • Hu/Pavlou/Zhang 2009 is Communications of the ACM 52(10), NOT MIS Quarterly — citation correction.

G. Open issues / known limitations¶

  • Phase 8c sadness-aware re-rank specified as future work — text-prior on temporal-loss markers (used to, years ago, grief self-report, resignation verbs) — would close the 0% sadness recall.
  • 8.54% residual error ceiling: 1,628 of 19,053 high-star (≥4) reviews tagged anger/sadness/fear by rubric model — symmetric corner not fixed by 8b.
  • Music↔negativity correlation observational only — Oster bound survives but causal identification needs a volume-cap geo-experiment.
  • 50bps churn effect statistically invisible in <12 months at PureGym scale (Lindstrom power analysis) — measurement infrastructure must be funded before interventions.
  • No identification strategy per intervention (Kumar critique) — single-site before/after = weak counterfactual; 4–8 concurrent interventions = total confounding.
  • Diffusion derating: 35% reduction at M0–3 for SoP changes; 50% at M0–12 for culture change — a 400-site rollout is a logistics problem, not a memo (Holm critique).
  • Sponsor discount rate (22% LP view) ≠ board operational rate (10% WACC) — long-payback NPVs roughly halve under sponsor view.
  • Habit-loss signal precedes formal cancellation by 4–6 weeks — cancellation is bookkeeping lag (Chakravarty critique).
  • 45-day induction programme missing from intervention list — Gym Group FY25 precedent, 2–4 month payback.
  • Topic diversity 0.388 — moderate overlap, expected for 66 topics in a 5,828-doc corpus; trade-off granularity vs separation.
  • Falcon-7B sample size 600 — defensible per rubric "subset" allowance, but underpowers per-topic claims.
  • Survivorship bias in reviews — only customers who chose to post; representativeness unknown.
  • BERTopic non-deterministic outside random_state=42 — sort by Creation Date adds reproducibility but doesn't replace seed.
  • pyLDAvis iframe heavy — file may corrupt; if blank, regenerate.
  • 28% of reviews "off-peak" (UTC) is misleading — UK evening reviews show as off-peak in UTC due to BST = UTC+1.
  • Word count: 1023+ post-appendix — over the 1000-word ceiling. Russell-tolerance applies but the rubric is binding.
  • 200bps month-to-month variance at site level limits detectability — the billing fix is the only intervention plausibly detectable solo within PE monitoring cadence.
  • 17 PT rent reform fragility: Chakravarty (PTs not price-rational), Holm (24+ months culture change), Price (NPV crashes at sponsor rate) — magnitude £45k → £15–25k revised.
  • ABSA, Review Intelligence, Complaint DNA, V2 Enhanced — beyond-rubric, not in conclusion.

H. Process / discipline learnings (Pierre's working rules)¶

  • LLMs are unreliable on resources — cost / time / RAM / disk-size / "save resources" claims are <70% red by default unless: (a) quoted vendor pricing page, (b) measured from real run, or (c) explicitly "I don't know, you decide". Relative units of work OK; never mix with absolute hours/dollars. Pierre is on Claude Max 20x — no per-token API cost.
  • Pre-assertion-check / verify before claim — promoted 2026-04-20 after curl-passes-but-browser-fails / ntfy-reinvent / cp1252-stdout cluster hit 0.85 in 24h. Skips trivially-in-session claims (own tool output) and non-factual (opinions, plans).
  • No default to incapacity — never claim "I can't access X" without trying the documented path. Pierre called out "I don't have live access to v2.sessions" as "huge fail". Read workbench/skill for capability path FIRST; run command; if fails, report specific failure (status code, stderr, role missing) not "I don't have access".
  • Default-try over default-no — LLM safety training pushes caution + caveats; in practical work that becomes a productivity tax. Correct bias for Pierre's workflow is default-try, report specific failures.
  • Subtract by default — when adding X, name what retires.
  • Documented + recurring → move from prompt to code (meta-learning) — when a learning documents a fix but the bug recurs, the fix lives at the wrong layer. Pattern: require_gpu(pipe) not "remember to assert"; compile() refusal not "remember to escape".
  • Read source-of-truth file before declaring it absent — heavy retraction after "most of what I just proposed is already shipped in pace-nlp-project/v3/".
  • Russell-validated stakeholder framework: list right things, let stakeholders price them up; OpEx/CapEx tagged, no fabricated £.
  • Walkthrough/report style: clinical not hyping. Don't use "strong/robust/excellent" about Pierre's own work. Rubric verbatim with tick marks (✓ HIT, ⚠ THIN, ✗ MISS, ★ BEYOND). Per-chart commentary: rubric tag + one-line explainer of what it shows + one-line on what it doesn't show. Paginated over single long scroll for 12+ phase docs.
  • Communication style: terse responses; no preamble ("Great question!"); no trailing summaries of what was just done. Flag weakness directly — say "THIN" or "weak because X", don't soften with "could potentially be strengthened".
  • Visual-verify every deploy: Chrome plugin first, then firecrawl screenshot chain — never claim done without rendering check.
  • Run All from a clean kernel restart, not iterative re-run — promoted after stopwords change in cell 24 didn't propagate to bar charts at cells 28–30 because tokens was cached from previous run.
  • Retry stochastic ML cells once before opening diagnosis rabbit hole — feedback_try_rerun_before_diagnose.md. Retry budget < diagnose budget for non-deterministic pipelines.
  • Sparring-mode "three takes before doing anything" — explicit hold before file mutation when the question is non-trivial. Used during the 345/398 + Sonnet-validation reframe.
  • Word-count check after every report edit, before commit.
  • Always commit per fix — Archivist self-flagged at session end: 8 NotebookEdit rounds + 1 commit = "snapshotting with a rubber band, not versioning". Tier-C auto-commit hook deferred.
  • Cohort cross-check: WhatsApp transcript shows Pierre was way ahead — classmates on BERTopic param debugging while Pierre on Phase 13 submission.
  • Tangent analyses (Zipf, embedding viz, info density) build intuition — not distractions.

I. Hand-labelling discipline (anger/sadness gold)¶

  • Find the pivot word ("but", "however") — emotion lives AFTER it; "lovely" before is face-saving preamble.
  • Anger blames outward ("they"); sadness grieves inward ("I feel") — direction is decision criterion.
  • Wistful past = sadness ("used to be brilliant"); accusatory past = anger ("they've let it decline").
  • Implicit ask: action/refund/apology = anger; sympathy/witnessing = sadness.
  • Override rule: text explicitly names emotion ("furious", "heartbroken") — trust self-report even if direction contradicts.
  • "Surprise" in reviews = discourse marker — unwrap to underlying emotion.
  • "Mixed" only for genuine 50–50, not "a bit of both" — dominant beat wins.
  • Don't rubber-stamp 8b/raw/indie labels — that's circular; your call is gold.

J. Cross-session journey (chronological)¶

2026-04-11 — Initial gold-label exploration¶

  • 50/50 hand-labelled gold. 8b vs indie (j-hartmann) cross-check. 8b anger-biased: 67% anger recall, 0% sadness recall on 31-row gold. Indie agrees with 8b 19.7% vs raw 5.3% — 3.7× win for fix direction.

2026-04-14 — Visual-verify discipline established¶

  • Mermaid v11 CDN switch. Nav-link gotcha (absolute paths broke local file://).

2026-04-16 — Russell live Q&A¶

  • Model swap green-lit (Falcon→Qwen). Cohort cross-check: WhatsApp transcript shows Pierre way ahead. Custom stopwords ruling (pure, pure gym, gym). Trustpilot Title+Content merge: 59% of titles add info — must merge. Sort by Creation Date adds reproducibility. Iterative stopwords advice (extended later via GENERIC_STOPS).

2026-04-18 morning — Steve session¶

  • Built basic/basic_notebook.ipynb (48 rubric items × verbatim rubric + "our learnings" + code cell). Falcon→Qwen. 7 learnings banked: hf_token_over_local_llm, hf_inference_api_deprecated, transformers_batch_inference, value_counts_mixed_index, check_downloads_before_editing, pipeline_device_pinned_at_creation, verbose_hf_progress. Memory migration to ~/brain-vault/.

2026-04-18 afternoon — Auditor session-close¶

  • Data-workbench discipline codified. 4 artifacts shipped: apply.py executor, render.py HTML dashboard, hooks/data-workbench-guard.sh PreToolUse, preflight.py. 4 violations of existing learnings, all 4 now code-enforced.

2026-04-18 evening — Workbench lift¶

  • Workbench tooling lifted to standalone ~/projects/data-workbench/. ROOT changed from __file__.parent.parent to Path.cwd() — one install serves every project.

2026-04-18 night — Post-broadcast synthesis¶

  • 5 parallel agents. Submission artifact ready 9 days early. 9 source patches via dw-apply patch-notebook produced basic_notebook_patched_v3.ipynb.

2026-04-19 morning — PACE 301 finalization¶

  • Topic ordering shuffled on a second BERTopic run; the hardcoded theme labels were now wrong. Fix: feedback_inject_outputs_for_pure_print_cells.md — directly mutate notebook JSON for pure-print cells, bypassing the Edit-tool .ipynb guard. Run-agnostic keyword-rule themes adopted.

2026-04-25 morning — Submission status check¶

  • v2_pending Colab run verified (8 cohort patches): 53/53 cells, 0 errors, 216 placeholders kept, 23 manual merges (intersection 312→335), UMAP random_state=42 on cells 39/60/73/84, _THEME_RULES, EXCLUDE_PLACEHOLDERS={'345','398'}. Drift flagged: report.md 310/6,328 vs notebook 335/5,931.

2026-04-25 morning — Bulk transcription¶

  • 1 background general-purpose agent transcribed 10 audio files via Deepgram nova-2 (diarize, smart_format, parallelism cap 4). 10/10 success, 7,236.9 s billed (120.6 min), 20,957 words, ~4 min wall-clock. Russell meetings cleanly identified: 2026-04-24 12:38 cohort Q&A (30:07, 6 speakers); 2026-04-26-stamped 1:1 (46:18, actually 2026-04-25 morning per VORMOO clock drift). Privacy split: osteopath consultation + family voice memos moved to ~/brain-vault/recordings/.

2026-04-25 evening — Two parallel sessions on same repo¶

  • One in pace-nlp-project (visual polish + rubric tightening + drift fixes), one in data-workbench (extended consultant report + Sonnet-validated shift-worker reframe + pilot designs). EXTENDED_REPORT.md (40 KB) committed at c6ce138 from the data-workbench session, deployed at pace-study.pages.dev/extended. v3_pending → canonical promotion. j-hartmann emotion cross-check appendix paragraph added to template.

2026-04-25 night — This addendum¶

  • Three parallel agents mined: project docs (172 bullets), brain-vault (98 bullets), Claude Code session JSONLs (110 bullets). Synthesized into this single addendum. Approximate read time: 30 minutes top-to-bottom; section-skip optimal.

This file is the long-form record. The submission report (report.md) is the 1000-word summary. The notebook itself carries the rubric-required code + commentary per cell.