ahessamb committed on
Commit
2492845
·
1 Parent(s): ab1e214

Add BERTopic model

Browse files
Files changed (4) hide show
  1. README.md +172 -0
  2. config.json +14 -0
  3. topic_embeddings.safetensors +3 -0
  4. topics.json +0 -0
README.md ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ ---
3
+ tags:
4
+ - bertopic
5
+ library_name: bertopic
6
+ pipeline_tag: text-classification
7
+ ---
8
+
9
+ # bertopic-umap15-hbd15-topn15
10
+
11
+ This is a [BERTopic](https://github.com/MaartenGr/BERTopic) model.
12
+ BERTopic is a flexible and modular topic modeling framework that allows for the generation of easily interpretable topics from large datasets.
13
+
14
+ ## Usage
15
+
16
+ To use this model, please install BERTopic:
17
+
18
+ ```bash
19
+ pip install -U bertopic
20
+ ```
21
+
22
+ You can use the model as follows:
23
+
24
+ ```python
25
+ from bertopic import BERTopic
26
+ topic_model = BERTopic.load("ahessamb/bertopic-umap15-hbd15-topn15")
27
+
28
+ topic_model.get_topic_info()
29
+ ```
30
+
31
+ ## Topic overview
32
+
33
+ * Number of topics: 105 (topic IDs -1 through 103; in BERTopic, topic -1 collects outlier documents)
34
+ * Number of training documents: 14320
35
+
36
+ <details>
37
+ <summary>Click here for an overview of all topics.</summary>
38
+
39
+ | Topic ID | Topic Keywords | Topic Frequency | Label |
40
+ |----------|----------------|-----------------|-------|
41
+ | -1 | market - price - nft - said - cryptocurrency | 15 | -1_market_price_nft_said |
42
+ | 0 | korea - funds - attack - hackers - fraud | 6725 | 0_korea_funds_attack_hackers |
43
+ | 1 | usd - 500 - near - bitcoin - consolidating | 706 | 1_usd_500_near_bitcoin |
44
+ | 2 | sized - digest - news - blockchain - radar | 417 | 2_sized_digest_news_blockchain |
45
+ | 3 | merge - ethereum - proof - fork - beacon | 236 | 3_merge_ethereum_proof_fork |
46
+ | 4 | rate - cpi - hikes - fomc - bitcoin | 209 | 4_rate_cpi_hikes_fomc |
47
+ | 5 | luna - ustc - entropy - proposal - terraform | 207 | 5_luna_ustc_entropy_proposal |
48
+ | 6 | brands - meta - worlds - immersive - decentraland | 206 | 6_brands_meta_worlds_immersive |
49
+ | 7 | russia - sanctions - crypto - ruble - settlements | 187 | 7_russia_sanctions_crypto_ruble |
50
+ | 8 | gensler - securities - coinbase - industry - regulation | 178 | 8_gensler_securities_coinbase_industry |
51
+ | 9 | blockchain - web3 - gamers - p2e - industry | 174 | 9_blockchain_web3_gamers_p2e |
52
+ | 10 | miners - carbon - power - bitcoin - report | 157 | 10_miners_carbon_power_bitcoin |
53
+ | 11 | funding - round - ventures - capital - gamestop | 151 | 11_funding_round_ventures_capital |
54
+ | 12 | xrp - ripple - price - level - resistance | 146 | 12_xrp_ripple_price_level |
55
+ | 13 | etf - blackrock - grayscale - bitcoin - futures | 145 | 13_etf_blackrock_grayscale_bitcoin |
56
+ | 14 | web3 - disco - mcmullen - identity - platforms | 144 | 14_web3_disco_mcmullen_identity |
57
+ | 15 | protocols - decentralized - newsletter - cefi - lending | 141 | 15_protocols_decentralized_newsletter_cefi |
58
+ | 16 | inu - lucie - meme - tokens - ecosystem | 139 | 16_inu_lucie_meme_tokens |
59
+ | 17 | ftx - sam - bankman - bankruptcy - ceo | 132 | 17_ftx_sam_bankman_bankruptcy |
60
+ | 18 | tether - usdt - documents - coindesk - stablecoins | 123 | 18_tether_usdt_documents_coindesk |
61
+ | 19 | el - bukele - nayib - bitcoin - x93 | 120 | 19_el_bukele_nayib_bitcoin |
62
+ | 20 | dogecoin - musk - meme - twitter - level | 114 | 20_dogecoin_musk_meme_twitter |
63
+ | 21 | 26 - resistance - near - btc - bulls | 106 | 21_26_resistance_near_btc |
64
+ | 22 | nft - opensea - doppel - marketplaces - rug | 101 | 22_nft_opensea_doppel_marketplaces |
65
+ | 23 | cfds - traders - assets - cryptocurrency - adoption | 95 | 23_cfds_traders_assets_cryptocurrency |
66
+ | 24 | difficulty - hashrate - bitcoin - network - height | 90 | 24_difficulty_hashrate_bitcoin_network |
67
+ | 25 | ubi - cointelegraph - simonin - bitcoin - income | 88 | 25_ubi_cointelegraph_simonin_bitcoin |
68
+ | 26 | coinbase - bitkey - india - ceo - fees | 85 | 26_coinbase_bitkey_india_ceo |
69
+ | 27 | donated - russia - invasion - transformation - donors | 83 | 27_donated_russia_invasion_transformation |
70
+ | 28 | celsius - cel - withdrawals - company - mashinsky | 81 | 28_celsius_cel_withdrawals_company |
71
+ | 29 | nfts - collections - million - floor - cryptopunk | 81 | 29_nfts_collections_million_floor |
72
+ | 30 | blockchain - bvm - mvc - maestro - databases | 78 | 30_blockchain_bvm_mvc_maestro |
73
+ | 31 | crypto - merchants - mastercard - feature - cashapp | 78 | 31_crypto_merchants_mastercard_feature |
74
+ | 32 | ada - cardano - bearish - satoshis - market | 76 | 32_ada_cardano_bearish_satoshis |
75
+ | 33 | nft - sartoshi - artists - snoop - community | 75 | 33_nft_sartoshi_artists_snoop |
76
+ | 34 | solana - bearish - outages - fibonacci - resistance | 72 | 34_solana_bearish_outages_fibonacci |
77
+ | 35 | hinman - ripple - speech - emails - xrp | 71 | 35_hinman_ripple_speech_emails |
78
+ | 36 | oecd - taxation - framework - india - electronic | 70 | 36_oecd_taxation_framework_india |
79
+ | 37 | terraform - montenegro - korea - x93 - milojko | 69 | 37_terraform_montenegro_korea_x93 |
80
+ | 38 | order - securities - freeze - restraining - cyprus | 68 | 38_order_securities_freeze_restraining |
81
+ | 39 | manchester - sponsorship - bcci - com - fans | 68 | 39_manchester_sponsorship_bcci_com |
82
+ | 40 | surveyed - millennials - managers - crypto - report | 67 | 40_surveyed_millennials_managers_crypto |
83
+ | 41 | whales - eth - market - transactions - usdt | 66 | 41_whales_eth_market_transactions |
84
+ | 42 | binance - kazakhstan - changpeng - expansion - 500m | 61 | 42_binance_kazakhstan_changpeng_expansion |
85
+ | 43 | twitter - musk - metatime - jack - yaccarino | 59 | 43_twitter_musk_metatime_jack |
86
+ | 44 | rsi - price - line - altcoin - bullish | 59 | 44_rsi_price_line_altcoin |
87
+ | 45 | china - huobi - hkma - regulatory - companies | 57 | 45_china_huobi_hkma_regulatory |
88
+ | 46 | token - leo - surged - tlos - graph | 57 | 46_token_leo_surged_tlos |
89
+ | 47 | cbdcs - governor - banks - mit - project | 56 | 47_cbdcs_governor_banks_mit |
90
+ | 48 | daos - chorus - lieberman - decentralized - organizations | 51 | 48_daos_chorus_lieberman_decentralized |
91
+ | 49 | fungible - nonfungible - tokens - nft - 2021 | 51 | 49_fungible_nonfungible_tokens_nft |
92
+ | 50 | altcoins - levels - overhead - support - bounce | 50 | 50_altcoins_levels_overhead_support |
93
+ | 51 | yuan - digital - tax - cbdc - wallets | 43 | 51_yuan_digital_tax_cbdc |
94
+ | 52 | depot - company - invest - banking - america | 42 | 52_depot_company_invest_banking |
95
+ | 53 | markets - advice - bull - hodlers - nasdaily | 42 | 53_markets_advice_bull_hodlers |
96
+ | 54 | eth - level - breakout - tradingview - analysts | 38 | 54_eth_level_breakout_tradingview |
97
+ | 55 | nethereum - usd - struggling - resistance - performers | 37 | 55_nethereum_usd_struggling_resistance |
98
+ | 56 | ecoterra - trending - swords - presale - neo | 36 | 56_ecoterra_trending_swords_presale |
99
+ | 57 | securities - market - binance - coinbase - week | 34 | 57_securities_market_binance_coinbase |
100
+ | 58 | staking - eigenlayer - sip - ethereum - tokens | 33 | 58_staking_eigenlayer_sip_ethereum |
101
+ | 59 | founder - ethereum - forgotten - values - twitter | 33 | 59_founder_ethereum_forgotten_values |
102
+ | 60 | bnb - bauer - upgrade - ecosystem - network | 32 | 60_bnb_bauer_upgrade_ecosystem |
103
+ | 61 | price - rsi - bullish - chart - resistance | 32 | 61_price_rsi_bullish_chart |
104
+ | 62 | expiry - week - billion - derivatives - bet | 32 | 62_expiry_week_billion_derivatives |
105
+ | 63 | vasil - fork - mainnet - newest - scalability | 31 | 63_vasil_fork_mainnet_newest |
106
+ | 64 | microstrategy - saylor - btc - rumor - billion | 31 | 64_microstrategy_saylor_btc_rumor |
107
+ | 65 | metamask - browser - wallets - features - allows | 31 | 65_metamask_browser_wallets_features |
108
+ | 66 | uae - east - chainalysis - singapore - emerging | 31 | 66_uae_east_chainalysis_singapore |
109
+ | 67 | outflows - etps - products - week - funds | 31 | 67_outflows_etps_products_week |
110
+ | 68 | polygon - zcash - kakarot - starknet - protocol | 29 | 68_polygon_zcash_kakarot_starknet |
111
+ | 69 | japanese - jvcea - stablecoin - x93 - fatf | 29 | 69_japanese_jvcea_stablecoin_x93 |
112
+ | 70 | asic - miner - gpu - mi300x - ks3 | 28 | 70_asic_miner_gpu_mi300x |
113
+ | 71 | arrows - voyager - dcg - genesis - bankruptcy | 28 | 71_arrows_voyager_dcg_genesis |
114
+ | 72 | axie - infinity - program - ronin - upgrades | 26 | 72_axie_infinity_program_ronin |
115
+ | 73 | withdrawals - platform - freeway - halted - babel | 26 | 73_withdrawals_platform_freeway_halted |
116
+ | 74 | addresses - eth - glassnode - underwater - cryptos | 26 | 74_addresses_eth_glassnode_underwater |
117
+ | 75 | bottoming - dip - markets - chain - altcoins | 25 | 75_bottoming_dip_markets_chain |
118
+ | 76 | mica - eu - conglomerates - jurisdictions - framework | 25 | 76_mica_eu_conglomerates_jurisdictions |
119
+ | 77 | liquidations - resting - bid - order - 200 | 25 | 77_liquidations_resting_bid_order |
120
+ | 78 | listings - missed - announcements - usdt - exchanges | 25 | 78_listings_missed_announcements_usdt |
121
+ | 79 | cbdc - ripple - border - imf - currencies | 25 | 79_cbdc_ripple_border_imf |
122
+ | 80 | announcements - delisting - pair - listing - collection | 24 | 80_announcements_delisting_pair_listing |
123
+ | 81 | treasury - mixers - sanctioning - github - prank | 24 | 81_treasury_mixers_sanctioning_github |
124
+ | 82 | polkadot - parachains - auctions - opengov - referenda | 24 | 82_polkadot_parachains_auctions_opengov |
125
+ | 83 | hedge - investors - crypto - traditional - enriquez | 23 | 83_hedge_investors_crypto_traditional |
126
+ | 84 | level - resistance - cj - price - cryptocurrency | 23 | 84_level_resistance_cj_price |
127
+ | 85 | nexo - citibank - vauld - acquisitions - launched | 22 | 85_nexo_citibank_vauld_acquisitions |
128
+ | 86 | huobi - li - citing - pantronics - rumours | 22 | 86_huobi_li_citing_pantronics |
129
+ | 87 | nft - textbook - pill - sweeney - x9caccessible | 21 | 87_nft_textbook_pill_sweeney |
130
+ | 88 | bored - yacht - apecoin - justin - collection | 21 | 88_bored_yacht_apecoin_justin |
131
+ | 89 | apecoin - pattern - chart - head - roc | 21 | 89_apecoin_pattern_chart_head |
132
+ | 90 | subscription - investment - binance - dual - 06 | 20 | 90_subscription_investment_binance_dual |
133
+ | 91 | halving - correlation - nasdaq - 2024 - powell | 20 | 91_halving_correlation_nasdaq_2024 |
134
+ | 92 | announcements - delisting - listing - crypto - slice | 20 | 92_announcements_delisting_listing_crypto |
135
+ | 93 | adoption - nigeria - kucoin - lawful - aza | 18 | 93_adoption_nigeria_kucoin_lawful |
136
+ | 94 | staff - chatbot - layoffs - hr - terminations | 18 | 94_staff_chatbot_layoffs_hr |
137
+ | 95 | ethereum - network - batching - costs - tx | 18 | 95_ethereum_network_batching_costs |
138
+ | 96 | suarez - desantis - salary - city - candidate | 18 | 96_suarez_desantis_salary_city |
139
+ | 97 | circle - stablecoin - integrating - cybavo - worldpay | 17 | 97_circle_stablecoin_integrating_cybavo |
140
+ | 98 | stablecoins - paypal - plabasan - mhel - converge22 | 17 | 98_stablecoins_paypal_plabasan_mhel |
141
+ | 99 | week - tokens - tvl - locked - analytical | 17 | 99_week_tokens_tvl_locked |
142
+ | 100 | impairment - company - holdings - incurred - btc | 17 | 100_impairment_company_holdings_incurred |
143
+ | 101 | cbdc - familiarity - euro - ecb - respondents | 17 | 101_cbdc_familiarity_euro_ecb |
144
+ | 102 | marketplace - opensea - popularize - ftx - teaming | 16 | 102_marketplace_opensea_popularize_ftx |
145
+ | 103 | executive - leaving - bitstamp - genesis - samir | 15 | 103_executive_leaving_bitstamp_genesis |
146
+
147
+ </details>
148
+
149
+ ## Training hyperparameters
150
+
151
+ * calculate_probabilities: False
152
+ * language: None
153
+ * low_memory: False
154
+ * min_topic_size: 15
155
+ * n_gram_range: (1, 1)
156
+ * nr_topics: None
157
+ * seed_topic_list: None
158
+ * top_n_words: 5
159
+ * verbose: False
160
+
161
+ ## Framework versions
162
+
163
+ * Numpy: 1.22.4
164
+ * HDBSCAN: 0.8.29
165
+ * UMAP: 0.5.3
166
+ * Pandas: 1.5.3
167
+ * Scikit-Learn: 1.2.2
168
+ * Sentence-transformers: 2.2.2
169
+ * Transformers: 4.30.2
170
+ * Numba: 0.56.4
171
+ * Plotly: 5.13.1
172
+ * Python: 3.10.12
config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "calculate_probabilities": false,
3
+ "language": null,
4
+ "low_memory": false,
5
+ "min_topic_size": 15,
6
+ "n_gram_range": [
7
+ 1,
8
+ 1
9
+ ],
10
+ "nr_topics": null,
11
+ "seed_topic_list": null,
12
+ "top_n_words": 5,
13
+ "verbose": false
14
+ }
topic_embeddings.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a92ff04f4f8ecc82303c5ecf37a901e6f88db865993753e0db64d00a135a89c
3
+ size 322648
topics.json ADDED
The diff for this file is too large to render. See raw diff