{"id":15422,"date":"2023-04-25T06:01:47","date_gmt":"2023-04-24T22:01:47","guid":{"rendered":"https:\/\/www.tejwin.com\/?post_type=insight&#038;p=15422"},"modified":"2024-06-20T14:04:51","modified_gmt":"2024-06-20T06:04:51","slug":"%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling","status":"publish","type":"insight","link":"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/","title":{"rendered":"\u3010Data Analysis\u3011TESG Event Radar Topic Modeling"},"content":{"rendered":"\n<figure class=\"wp-block-image aligncenter size-full\" id=\"325c\"><img fetchpriority=\"high\" decoding=\"async\" width=\"1920\" height=\"1080\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1.png\" alt=\"TESG\u4e8b\u4ef6\u96f7\u9054\u4e3b\u984c\u5206\u6790\" class=\"wp-image-24444\" srcset=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1.png 1920w, https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1-300x169.png 300w, https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1-1024x576.png 1024w, https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1-150x84.png 150w, https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1-768x432.png 768w, https:\/\/www.tejwin.com\/wp-content\/uploads\/\u6563\u71b1-1-1536x864.png 1536w\" sizes=\"(max-width: 1920px) 100vw, 1920px\" \/><\/figure>\n\n\n\n<div id=\"ez-toc-container\" class=\"ez-toc-v2_0_81 counter-hierarchy ez-toc-counter ez-toc-grey ez-toc-container-direction\">\n<p class=\"ez-toc-title\" style=\"cursor:inherit\">Table of Contents<\/p>\n<label for=\"ez-toc-cssicon-toggle-item-69f1340bb6615\" class=\"ez-toc-cssicon-toggle-label\"><span class=\"ez-toc-cssicon\"><span class=\"eztoc-hide\" style=\"display:none;\">Toggle<\/span><span class=\"ez-toc-icon-toggle-span\"><svg style=\"fill: #999;color:#999\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" class=\"list-377408\" width=\"20px\" height=\"20px\" viewBox=\"0 0 24 24\" 
fill=\"none\"><path d=\"M6 6H4v2h2V6zm14 0H8v2h12V6zM4 11h2v2H4v-2zm16 0H8v2h12v-2zM4 16h2v2H4v-2zm16 0H8v2h12v-2z\" fill=\"currentColor\"><\/path><\/svg><svg style=\"fill: #999;color:#999\" class=\"arrow-unsorted-368013\" xmlns=\"http:\/\/www.w3.org\/2000\/svg\" width=\"10px\" height=\"10px\" viewBox=\"0 0 24 24\" version=\"1.2\" baseProfile=\"tiny\"><path d=\"M18.2 9.3l-6.2-6.3-6.2 6.3c-.2.2-.3.4-.3.7s.1.5.3.7c.2.2.4.3.7.3h11c.3 0 .5-.1.7-.3.2-.2.3-.5.3-.7s-.1-.5-.3-.7zM5.8 14.7l6.2 6.3 6.2-6.3c.2-.2.3-.5.3-.7s-.1-.5-.3-.7c-.2-.2-.4-.3-.7-.3h-11c-.3 0-.5.1-.7.3-.2.2-.3.5-.3.7s.1.5.3.7z\"\/><\/svg><\/span><\/span><\/label><input type=\"checkbox\"  id=\"ez-toc-cssicon-toggle-item-69f1340bb6615\"  aria-label=\"Toggle\" \/><nav><ul class='ez-toc-list ez-toc-list-level-1 ' ><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-1\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Highlights\" >Highlights:<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-2\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Preface\" >Preface<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-3\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Background_Knowledge\" >Background Knowledge<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-4\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Programming_environment_and_Module_required\" >Programming environment and Module required<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-5\" 
href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Database\" >Database<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-6\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Data_Import\" >Data Import<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-7\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Prepossessing\" >Prepossessing<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-8\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Build_Model\" >Build Model<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-9\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#Source_Code\" >Source Code<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-10\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#%E5%BB%B6%E4%BC%B8%E9%96%B1%E8%AE%80\" >\u5ef6\u4f38\u95b1\u8b80<\/a><\/li><li class='ez-toc-page-1 ez-toc-heading-level-2'><a class=\"ez-toc-link ez-toc-heading-11\" href=\"https:\/\/www.tejwin.com\/en\/insight\/%e3%80%90data-analysis%e3%80%91tesg-event-radar-topic-modeling\/#%E7%9B%B8%E9%97%9C%E9%80%A3%E7%B5%90\" >\u76f8\u95dc\u9023\u7d50<\/a><\/li><\/ul><\/nav><\/div>\n<h2 class=\"wp-block-heading\" id=\"e60e\"><span class=\"ez-toc-section\" id=\"Highlights\"><\/span>Highlights:<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Difficulty\uff1a\u2605\u2605\u2605\u2606\u2606<\/li>\n\n\n\n<li>Using the data of \u201cTESG Event Radar\u201d to 
fulfill topic modeling.<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"a691\"><span class=\"ez-toc-section\" id=\"Preface\"><\/span>Preface<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"ec78\">As industrialization has thrived, the severe impacts of global-scale extreme climate change on the natural environment, society, and the international economy, such as ENSO (El Ni\u00f1o Southern Oscillation), rising sea levels, and air pollution, have gradually become the focus of national attention. These impacts led nations to ponder how to co-exist with mother nature, and therefore, in 2005, the UN proposed that \u201cenvironment,\u201d \u201csocial,\u201d and \u201cgovernance,\u201d the so-called \u201cESG,\u201d should be included in the evaluation of corporations. The UN expected that doing so would positively affect society, the market, and individuals. Simultaneously, corporations had to consider how to maintain revenue growth and achieve sustainable development in an unpredictable commercial environment.<\/p>\n\n\n\n<p id=\"7698\">However, countless pieces of information appear in the market every day; it is difficult for an individual to know and understand all of them.<br>Today we will use the \u201cTESG Event Radar\u201d data to fulfill topic modeling, which can help us easily and quickly understand the ESG trends or topics of open data of the government, shareholders meeting report, and ESG report.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"f211\"><span class=\"ez-toc-section\" id=\"Background_Knowledge\"><\/span><strong>Background Knowledge<\/strong><span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"a162\">Generally, a text consists of multiple topics; the proportion of each topic differs, and so does the frequency of each keyword. 
Topic modeling can analyze the words in the text, counting the distribution of possibility of which topic the text belongs to.<\/p>\n\n\n\n<p id=\"0190\">LDA (Latent Dirichlet Allocation) is an unsupervised topic model which is a normalized PLSI (Probabilistic latent semantic analysis) used for collecting, classifying, and dimension reduction in text.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"6bc1\"><span class=\"ez-toc-section\" id=\"Programming_environment_and_Module_required\"><\/span>Programming environment and Module required<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"abbb\">This article uses Mac OS as a system and Jupyter as an editor.<\/p>\n\n\n\n<pre id=\"5ca3\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-keyword\">import<\/span> tejapi\n\n<span class=\"hljs-comment\"># \u524d\u8655\u7406\u5957\u4ef6<\/span>\n<span class=\"hljs-keyword\">import<\/span> pandas <span class=\"hljs-keyword\">as<\/span> pd \n<span class=\"hljs-keyword\">import<\/span> re\n<span class=\"hljs-keyword\">import<\/span> numpy <span class=\"hljs-keyword\">as<\/span> np \n<span class=\"hljs-keyword\">from<\/span> datetime <span class=\"hljs-keyword\">import<\/span> datetime\n<span class=\"hljs-keyword\">from<\/span> ckip_transformers.nlp <span class=\"hljs-keyword\">import<\/span> CkipWordSegmenter, CkipPosTagger, CkipNerChunker\n\n<span class=\"hljs-comment\"># \u6a21\u578b\u5957\u4ef6<\/span>\n<span class=\"hljs-keyword\">from<\/span> numba <span class=\"hljs-keyword\">import<\/span> jit, cuda\n<span class=\"hljs-keyword\">import<\/span> gensim\n<span class=\"hljs-keyword\">from<\/span> gensim <span class=\"hljs-keyword\">import<\/span> corpora, models\n<span class=\"hljs-keyword\">from<\/span> gensim.models.coherencemodel <span class=\"hljs-keyword\">import<\/span> CoherenceModel\n<span class=\"hljs-keyword\">from<\/span> gensim.models.ldamodel <span class=\"hljs-keyword\">import<\/span> LdaModel\n\n<span 
class=\"hljs-comment\"># \u8996\u89ba\u5316\u5957\u4ef6<\/span>\n<span class=\"hljs-keyword\">import<\/span> matplotlib\n<span class=\"hljs-keyword\">import<\/span> matplotlib.pyplot <span class=\"hljs-keyword\">as<\/span> plt\n<span class=\"hljs-keyword\">import<\/span> pyLDAvis.gensim_models\n<span class=\"hljs-keyword\">from<\/span> wordcloud <span class=\"hljs-keyword\">import<\/span> WordCloud\n\n\n<span class=\"hljs-comment\"># \u8f14\u52a9\u5957\u4ef6<\/span>\n<span class=\"hljs-keyword\">import<\/span> warnings\nwarnings.filterwarnings(<span class=\"hljs-string\">\"ignore\"<\/span>)\n\ntejapi.ApiConfig.api_key = <span class=\"hljs-string\">\"Your Key\"<\/span>\ntejapi.ApiConfig.ignoretz = <span class=\"hljs-literal\">True<\/span><\/span><\/pre>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"4bc6\"><span class=\"ez-toc-section\" id=\"Database\"><\/span>Database<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"c68d\"><a href=\"https:\/\/api.tej.com.tw\/columns.html?idCode=TWN%2FAEWATCHA\" target=\"_blank\" rel=\"noreferrer noopener\" class=\"ek-link\">TESG Event Radar (TWN\/AEWATCHA)<\/a><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"36c8\"><span class=\"ez-toc-section\" id=\"Data_Import\"><\/span>Data Import<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"70eb\">For the period from 2022\u201301\u201301 to 2023\u201304\u201301, we take TSMC(2330) as an example.<\/p>\n\n\n\n<pre id=\"7329\" class=\"wp-block-preformatted\"><span class=\"pre--content\">stock_id = <span class=\"hljs-string\">'2330'<\/span>, <span class=\"hljs-string\">'2303'<\/span>, <span class=\"hljs-string\">'2881'<\/span>, <span class=\"hljs-string\">'3045'<\/span>\ngte, lte = <span class=\"hljs-string\">'2022-01-01'<\/span>, <span class=\"hljs-string\">'2023-04-01'<\/span>\nTESG = tejapi.get(<span class=\"hljs-string\">'TWN\/AEWATCHA'<\/span>,\n                   paginate = True,\n                   coid = stock_id,\n                   mdate = {<span 
class=\"hljs-string\">'gte'<\/span>:gte, <span class=\"hljs-string\">'lte'<\/span>:lte},\n                  )\n\n<span class=\"hljs-built_in\">df<\/span> = TESG\n<span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"mdate\"<\/span>] = pd.to_datetime(<span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"mdate\"<\/span>])<\/span><\/pre>\n\n\n\n<figure class=\"wp-block-image\" id=\"8af7\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/1NUmpAo2s1L0og3y0XppAPg-2.png\" alt=\"TESG\u4e8b\u4ef6\u96f7\u9054:\u8cc7\u6599\u6b04\u4f4d\u53ca\u578b\u614b\"\/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"dfb1\"><span class=\"ez-toc-section\" id=\"Prepossessing\"><\/span>Preprocessing<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"737f\">We will use \u201cckip_transformers,\u201d the NLP package developed by Academia Sinica for Mandarin tokenization, POS tagging, and NER. The reason is that, unlike \u201cjieba\u201d or \u201csnowNLP,\u201d which were developed in China, the tokenization results of \u201cckip_transformers\u201d fit Taiwanese usage much better.<\/p>\n\n\n\n<pre id=\"9fe1\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-comment\"># Initialize drivers<\/span>\n<span class=\"hljs-built_in\">print<\/span>(<span class=\"hljs-string\">\"Initializing drivers ... WS\"<\/span>)\n<span class=\"hljs-comment\"># device=0 \u70ba\u4f7f\u7528gpu\u9032\u884c\u904b\u7b97\uff0c\u5982\u96fb\u8166\u7121gpu\u8005\u53ef\u6539\u70ba device=-1 \u7528cpu\u904b\u7b97<\/span>\nws_driver = CkipWordSegmenter(model=<span class=\"hljs-string\">\"albert-base\"<\/span>, device=<span class=\"hljs-number\">0<\/span>)\n<span class=\"hljs-built_in\">print<\/span>(<span class=\"hljs-string\">\"Initializing drivers ... 
POS\"<\/span>)\npos_driver = CkipPosTagger(model=<span class=\"hljs-string\">\"bert-base\"<\/span>, device=<span class=\"hljs-number\">0<\/span>)\n<span class=\"hljs-built_in\">print<\/span>(<span class=\"hljs-string\">\"Initializing drivers ... NER\"<\/span>)<\/span><\/pre>\n\n\n\n<p id=\"1059\">The next step will be word segmentation and part-of-speech tagging. We use the \u201clambda\u201d function to process each piece of data. Its advantage is high efficiency and simple code. Compared with the \u201cfor\u201d loop, it can complete much data processing faster.<\/p>\n\n\n\n<pre id=\"f308\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"seg\"<\/span>] = list(map(lambda x: ws_driver([x]), list(<span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"newstxt_1\"<\/span>])))\n<span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"seg\"<\/span>] = <span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"seg\"<\/span>].apply(lambda x : x[0])\n<span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"pos\"<\/span>] = <span class=\"hljs-built_in\">df<\/span>[<span class=\"hljs-string\">\"seg\"<\/span>].apply(lambda x : pos_driver(x))<\/span><\/pre>\n\n\n\n<figure class=\"wp-block-image caption-align-center\" id=\"977e\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/1aZ5dByrzpCRn5G62rghXqw-2.png\" alt=\"\"\/><figcaption class=\"wp-element-caption\">\u6587\u7ae0\u65b7\u8a5e\u8207\u8a5e\u6027\u6a19\u8a3b\u7d50\u679c<\/figcaption><\/figure>\n\n\n\n<p id=\"c9b1\">From the above figure, we can see that a single word may have multiple parts of speech after part-of-speech tagging. For most languages, the meaning of sentences mainly concentrates on nouns and verbs. 
Therefore, in the next step, we need to filter out noun or verb words and put the result into a new column \u201cN_or_V.\u201d<\/p>\n\n\n\n<pre id=\"3e22\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-comment\"># \u8a5e\u6027\u904e\u6ffe<\/span>\n<span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title function_\">fltr_nv<\/span>(<span class=\"hljs-params\">word_lst, pos_lst<\/span>):\n  lst = []\n  <span class=\"hljs-keyword\">for<\/span> word, pos <span class=\"hljs-keyword\">in<\/span> <span class=\"hljs-built_in\">zip<\/span>(word_lst, pos_lst):\n    <span class=\"hljs-keyword\">for<\/span> i <span class=\"hljs-keyword\">in<\/span> pos:\n      <span class=\"hljs-keyword\">if<\/span> i.startswith((<span class=\"hljs-string\">\"N\"<\/span>, <span class=\"hljs-string\">\"V\"<\/span>)):\n        lst.append(word)\n        <span class=\"hljs-keyword\">break<\/span>\n  <span class=\"hljs-keyword\">return<\/span> lst\n\ndf[<span class=\"hljs-string\">\"N_or_V\"<\/span>] = df.apply(<span class=\"hljs-keyword\">lambda<\/span> x : fltr_nv(x[<span class=\"hljs-string\">\"seg\"<\/span>], x[<span class=\"hljs-string\">\"pos\"<\/span>]), axis = <span class=\"hljs-number\">1<\/span>)<\/span><\/pre>\n\n\n\n<p id=\"9672\">Let\u2019s check the time distribution of data by month.<\/p>\n\n\n\n<pre id=\"70c1\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-comment\"># \u8a08\u7b97\u6bcf\u6708\u6587\u7ae0\u6578<\/span>\ngb_corp = df_corp[[<span class=\"hljs-string\">\"mdate\"<\/span>, <span class=\"hljs-string\">\"N_or_V\"<\/span>]].groupby([df.mdate.dt.year, df.mdate.dt.month])\na = <span class=\"hljs-number\">0<\/span>\nlst = []\n\n<span class=\"hljs-keyword\">for<\/span> group_key, group_value <span class=\"hljs-keyword\">in<\/span> gb_corp:\n    group = gb_corp.get_group(group_key)\n    dct = {\n        <span class=\"hljs-string\">\"month\"<\/span> : datetime.strptime(<span 
class=\"hljs-built_in\">str<\/span>(group[<span class=\"hljs-string\">'mdate'<\/span>].iloc[<span class=\"hljs-number\">0<\/span>])[:<span class=\"hljs-number\">7<\/span>], <span class=\"hljs-string\">\"%Y-%m\"<\/span>),\n        <span class=\"hljs-string\">\"key_word\"<\/span> : [i <span class=\"hljs-keyword\">for<\/span> i <span class=\"hljs-keyword\">in<\/span> group[<span class=\"hljs-string\">'N_or_V'<\/span>]]\n    }\n    lst.append(dct)\n\n    <span class=\"hljs-built_in\">print<\/span>(<span class=\"hljs-string\">f\"<span class=\"hljs-subst\">{group_key}<\/span> : <span class=\"hljs-subst\">{<span class=\"hljs-built_in\">len<\/span>(group)}<\/span>\"<\/span>)\n    a+=<span class=\"hljs-built_in\">len<\/span>(group)\n<span class=\"hljs-built_in\">print<\/span>(a)<\/span><\/pre>\n\n\n\n<figure id=\"882c\" class=\"graf graf--figure graf-after--pre\">\n<\/figure>\n\n\n\n<figure class=\"wp-block-image aligncenter caption-align-center graf-image\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/1wkw3Lk9tvZFiVjdJ3f_wtA-2.png\" alt=\"\u53f0\u7a4d\u96fb\u6bcf\u6708\u8cc7\u6599\u6578\u91cf\"\/><figcaption class=\"wp-element-caption\">\u53f0\u7a4d\u96fb\u6bcf\u6708\u8cc7\u6599\u6578<\/figcaption><\/figure>\n\n\n\n<p id=\"aec0\">We can see from the word cloud that, although we did not use TF-IDF, TextRank, or other keyword extraction methods, the results obtained by part-of-speech filtering alone look reasonable.<\/p>\n\n\n\n<pre id=\"75f5\" class=\"wp-block-preformatted\"><span class=\"pre--content\">!wget https:\/\/raw.githubusercontent.com\/victorgau\/wordcloud\/master\/SourceHanSansTW-Regular.otf -o \/dev\/null\n%matplotlib inline\n\n<span class=\"hljs-comment\"># \u5f9e Google \u4e0b\u8f09\u7684\u4e2d\u6587\u5b57\u578b<\/span>\nfont = <span class=\"hljs-string\">'SourceHanSansTW-Regular.otf'<\/span>\n\ndf_keyword = pd.DataFrame(lst)\ndf_keyword[<span class=\"hljs-string\">\"key_word\"<\/span>] = df_keyword[<span 
class=\"hljs-string\">\"key_word\"<\/span>].apply(<span class=\"hljs-keyword\">lambda<\/span> x : <span class=\"hljs-string\">\" \"<\/span>.join(x[<span class=\"hljs-number\">0<\/span>]))\ndf_keyword[<span class=\"hljs-string\">\"pic\"<\/span>] = df_keyword[<span class=\"hljs-string\">\"key_word\"<\/span>].apply(<span class=\"hljs-keyword\">lambda<\/span> x : WordCloud(font_path=font, max_words = <span class=\"hljs-number\">20<\/span>, background_color = <span class=\"hljs-string\">\"white\"<\/span>).generate(x))\n\nplt.imshow(df_keyword[<span class=\"hljs-string\">\"pic\"<\/span>].iloc[<span class=\"hljs-number\">10<\/span>])\nplt.axis(<span class=\"hljs-string\">\"off\"<\/span>)\nplt.show()<\/span><\/pre>\n\n\n\n<figure id=\"8949\" class=\"graf graf--figure graf-after--pre\">\n<\/figure>\n\n\n\n<figure class=\"wp-block-image aligncenter caption-align-center graf-image\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/19M_jGXqKa_hln7JovkqWXg-2.png\" alt=\"\u6587\u5b57\u96f2\"\/><figcaption class=\"wp-element-caption\">\u6587\u5b57\u96f2<\/figcaption><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"adff\"><span class=\"ez-toc-section\" id=\"Build_Model\"><\/span>Build Model<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<p id=\"5fda\">We use the gensim package to build the LDA model. 
The first step is to create a dictionary that assigns each word a corresponding number, and then count how many times each numbered word appears in each article.<\/p>\n\n\n\n<pre id=\"1004\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-comment\"># \u5c07\u904e\u6ffe\u5f8c\u7684\u55ae\u8a5e\u8f49\u63db\u70ba\u8f49\u63db\u70balist of list\u5f62\u5f0f<\/span>\nseg_lst = <span class=\"hljs-built_in\">list<\/span>(df_corp[<span class=\"hljs-string\">\"N_or_V\"<\/span>])\n\n<span class=\"hljs-comment\"># corpora.Dictionary() input \u662f\u6587\u5b57\u7684 list of list <\/span>\ndictionary = corpora.Dictionary(seg_lst)\n<span class=\"hljs-comment\"># corpus\u70ba (\u7de8\u865f\uff1a\u51fa\u73fe\u5b57\u6578) \u7684 list of list<\/span>\ncorpus = [dictionary.doc2bow(i) <span class=\"hljs-keyword\">for<\/span> i <span class=\"hljs-keyword\">in<\/span> seg_lst]<\/span><\/pre>\n\n\n\n<p id=\"2550\">The next step is to build a model, but we encounter a problem. Since LDA topic classification needs to be given the number of topics in advance, what is the most appropriate number of topics?<br>Here we use Log Perplexity and Topic Coherence to choose the number of topics.<\/p>\n\n\n\n<p id=\"c8db\">Log Perplexity reflects the level of \u201cuncertainty\u201d in the model\u2019s prediction results, that is, for an article, how uncertain we are that it belongs to a particular topic, so the more topics there are, the lower the perplexity. Still, it should be noted that when the number of topics is large, the generated model tends to overfit, so it is not possible to evaluate a model solely by perplexity.<\/p>\n\n\n\n<p id=\"95c0\">In contrast, Topic Coherence measures the semantic similarity between the high-scoring words in each topic. These measurements help to distinguish semantically interpretable topics from topics that are merely artifacts of statistical inference. The higher the score, the higher the semantic consistency of the words within each topic. 
In general, the Coherence score increases with the number of topics, but the gains diminish as the number of topics grows. In this situation, the elbow method is often used to make a trade-off between the number of topics and the coherence score.<\/p>\n\n\n\n<p id=\"e4a4\">We won\u2019t go into detail here about why these two measures can help us determine the number of topics. The only concept needed at present is that the lower the Log Perplexity, the better the classification effect, and the higher the Topic Coherence, the better the classification effect.<\/p>\n\n\n\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p id=\"3751\">Note that there is no guideline for where Perplexity and Coherence should fall. The scores we get depend on the data they are calculated on. For example, a score of 0.5 might be good enough in one situation but unacceptable in another.<br>The only rule is to minimize the Perplexity score and maximize the Coherence score.<\/p>\n<\/blockquote>\n\n\n\n<pre id=\"ac70\" class=\"wp-block-preformatted\"><span class=\"pre--content\"><span class=\"hljs-comment\"># \u56f0\u60d1\u5ea6\u8a08\u7b97<\/span>\n<span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title function_\">perplexity<\/span>(<span class=\"hljs-params\">num_topics<\/span>):\n    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes = <span class=\"hljs-number\">30<\/span>)\n    <span class=\"hljs-built_in\">print<\/span>(ldamodel.print_topics(num_topics = num_topics, num_words = <span class=\"hljs-number\">15<\/span>))\n    <span class=\"hljs-built_in\">print<\/span>(ldamodel.log_perplexity(corpus))\n    <span class=\"hljs-keyword\">return<\/span> ldamodel.log_perplexity(corpus)\n\n<span class=\"hljs-comment\"># \u4e3b\u984c\u4e00\u81f4\u6027\u8a08\u7b97<\/span>\n<span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title 
function_\">coherence<\/span>(<span class=\"hljs-params\">num_topics<\/span>):\n    ldamodel = LdaModel(corpus, num_topics = num_topics, id2word = dictionary, passes = <span class=\"hljs-number\">30<\/span>, random_state = <span class=\"hljs-number\">42<\/span>)\n    <span class=\"hljs-built_in\">print<\/span>(ldamodel.print_topics(num_topics = num_topics, num_words = <span class=\"hljs-number\">15<\/span>))\n    ldacm = CoherenceModel(model = ldamodel, texts = seg_lst, dictionary = dictionary, coherence=<span class=\"hljs-string\">\"c_v\"<\/span>)\n    <span class=\"hljs-built_in\">print<\/span>(ldacm.get_coherence())\n    <span class=\"hljs-keyword\">return<\/span> ldacm.get_coherence()<\/span><\/pre>\n\n\n\n<figure id=\"2267\" class=\"graf graf--figure graf-after--pre\">\n<\/figure>\n\n\n\n<figure class=\"wp-block-image aligncenter caption-align-center graf-image\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/1Fqgo9H36rkUS_PdMppYJGw-2.png\" alt=\"TESG\u4e8b\u4ef6\u96f7\u9054:topic-perplexity variation\"\/><figcaption class=\"wp-element-caption\">TESG\u4e8b\u4ef6\u96f7\u9054:topic-perplexity variation<\/figcaption><\/figure>\n\n\n\n<p id=\"fdc7\">From the above figure, we can find that Log Perplexity will drop sharply after nine topics, which implies that the model may be overfitting at ten, so the number of topics we choose should be less than 10, so the scale of the number of topics we input into Topic Coherence will be 1 ~ 9.<\/p>\n\n\n\n<figure id=\"71d3\" class=\"graf graf--figure graf-after--p\">\n<\/figure>\n\n\n\n<figure class=\"wp-block-image aligncenter caption-align-center graf-image\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/147TQoy5U0JtfhrTHIBUcfw-2.png\" alt=\"TESG\u4e8b\u4ef6\u96f7\u9054:topic-coherence variation\"\/><figcaption class=\"wp-element-caption\">&nbsp;TESG\u4e8b\u4ef6\u96f7\u9054:topic-coherence variation<\/figcaption><\/figure>\n\n\n\n<figcaption 
class=\"imageCaption\"><\/figcaption>\n\n\n\n<p id=\"6d1c\">The results show that the classification model has the highest score when the number of topics is 7. We therefore classify the articles into seven topics and pass this number of topics to the model.<\/p>\n\n\n\n<pre id=\"40cf\" class=\"wp-block-preformatted\"><span class=\"pre--content\">num_topics = <span class=\"hljs-number\">7<\/span>\nlda = LdaModel(corpus, num_topics = num_topics, id2word = dictionary, passes = <span class=\"hljs-number\">30<\/span>, random_state = <span class=\"hljs-number\">42<\/span>)\n<span class=\"hljs-comment\">#  \u5370\u51fa\u6bcf\u500b\u4e3b\u984c\u4e2d\u7684\u524d15\u500b\u95dc\u9375\u5b57\u8a5e<\/span>\ntopics_lst = lda.print_topics()\n<span class=\"hljs-built_in\">print<\/span>(topics_lst)<\/span><\/pre>\n\n\n\n<p id=\"11c6\">In this way, the model is completed. We use the visualization package \u201cpyLDAvis\u201d to present the classification results of the model. The detailed visualization code will be provided at the end of the article.<\/p>\n\n\n\n<figure id=\"5d69\" class=\"graf graf--figure graf-after--p\">\n<\/figure>\n\n\n\n<figure class=\"wp-block-image aligncenter caption-align-center graf-image\"><img decoding=\"async\" src=\"https:\/\/www.tejwin.com\/wp-content\/uploads\/1btU2-ZvckPj0R5pSxIr_qA-2.png\" alt=\"TESG\u4e8b\u4ef6\u96f7\u9054:pyLDAvis visualization\"\/><figcaption class=\"wp-element-caption\">TESG\u4e8b\u4ef6\u96f7\u9054:pyLDAvis visualization<\/figcaption><\/figure>\n\n\n\n<figcaption class=\"imageCaption\"><\/figcaption>\n\n\n\n<p id=\"cde9\">The visualization of pyLDAvis contains two charts.<br>On the left side is the result of the classification of topics. Each circle represents one topic; the larger the area, the more articles are classified under that topic. 
The axes result from PCA (Principal Component Analysis): the x-axis is PC1 and the y-axis is PC2; the distance between circles reflects the similarity of the two topics.<br>The other side shows the keyword statistics; the blue bar chart is the number of occurrences of each keyword in all articles, and the red bar chart is the number of occurrences of each keyword in your chosen topic. Adjusting the \u03bb value above can display the unique keywords in this topic; the lower the \u03bb value, the more unique the keywords.<\/p>\n\n\n\n<p id=\"5309\">The classification of topics is not bad: apart from a slight overlap between the sixth and the seventh topics, the others are well separated.<br>However, the LDA method used for classification depends on the math. The result may be clearly and directly explained by math,&nbsp;<strong>but it usually doesn\u2019t match human judgment and is sometimes even negatively correlated with it.<\/strong>&nbsp;From a human\u2019s perspective, these results sometimes have a vague concept or logic.<\/p>\n\n\n\n<p id=\"7490\">Whether the data has been properly pre-processed will also significantly affect the final classification results. There is an old saying in the IT industry, \u201cgarbage in, garbage out.\u201d You can get ideal results by doing an excellent job in data pre-processing. However, pre-processing involves many complicated and tedious procedures. 
Different task requirements often require a certain degree of professional knowledge and relevant experience, which an individual cannot quickly acquire alone.<\/p>\n\n\n\n<p id=\"e88b\">TESG Event Radar, through massive data sources, analysis by professional researchers, and natural language processing models, lets users quickly grasp the latest ESG news about various companies without needing the complex, high-threshold technologies and knowledge mentioned above.<\/p>\n\n\n\n<p id=\"329d\">TESG Event Radar has four significant advantages:<\/p>\n\n\n\n<p id=\"fe12\"><strong>Multiple sources of events:<br><\/strong>Includes more than 20 public sources of information, tracks the occurrence of various corporate ESG events, and follows up on company developments.<\/p>\n\n\n\n<p id=\"4b7c\"><strong>Sustainable event classification:<\/strong><br>Based on TESG, events are divided into three categories and 16 subcategories so that event attributes and impact levels can be understood quickly.<\/p>\n\n\n\n<p id=\"7d37\"><strong>Intensity marker:<\/strong><br>Quickly identify the magnitude of an event\u2019s impact.<\/p>\n\n\n\n<p id=\"901c\"><strong>Keypoint marker:<\/strong><br>Identify the nature of the event\/the amount of the penalty.<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"5a0f\"><span class=\"ez-toc-section\" id=\"Source_Code\"><\/span><strong>Source Code<\/strong><span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a href=\"https:\/\/gist.github.com\/tej87681088\/18f917d05baff33e76d938ac74f52a5f\" class=\"ek-link\" target=\"_blank\" rel=\"noopener\">GitHub<\/a><\/li>\n<\/ul>\n\n\n\n<figure id=\"4e11\" class=\"graf graf--figure graf--iframe graf-after--h4\"><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"098b\"><span class=\"ez-toc-section\" id=\"%E5%BB%B6%E4%BC%B8%E9%96%B1%E8%AE%80\"><\/span>\u5ef6\u4f38\u95b1\u8b80<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<ul 
class=\"wp-block-list\">\n<li><a href=\"https:\/\/www.tejwin.com\/en\/insight\/women-in-boardroom-in-2023\/\" class=\"ek-link\">Women in Boardroom in 2023: From Critical Minority to Majority<\/a><\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"a5f3\"><span class=\"ez-toc-section\" id=\"%E7%9B%B8%E9%97%9C%E9%80%A3%E7%B5%90\"><\/span>\u76f8\u95dc\u9023\u7d50<span class=\"ez-toc-section-end\"><\/span><\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li><a href=\"https:\/\/api.tej.com.tw\/index.html\" class=\"ek-link\" target=\"_blank\" rel=\"noopener\">TEJ API <\/a><\/li>\n\n\n\n<li><a href=\"https:\/\/www.tejwin.com\/solution\/esg-solution\/\" target=\"_blank\" aria-label=\" (opens in a new tab)\" rel=\"noreferrer noopener\" class=\"ek-link\">TESG<\/a><\/li>\n<\/ul>\n","protected":false},"excerpt":{"rendered":"<p>Today we will use the \u201cTESG Event Radar\u201d data to fulfill topic modeling, which can help us easily and quickly understand the ESG trends or topics of open data of the government, shareholders meeting report, and ESG 
report.<\/p>\n","protected":false},"featured_media":24444,"template":"","tags":[2352,2610,2642],"insight-category":[690,3509,50],"class_list":["post-15422","insight","type-insight","status-publish","has-post-thumbnail","hentry","tag-esg","tag-nlp","tag-topic-modeling","insight-category-data-analysis","insight-category-fintech-en","insight-category-fintech"],"acf":[],"_links":{"self":[{"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/insight\/15422","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/insight"}],"about":[{"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/types\/insight"}],"version-history":[{"count":3,"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/insight\/15422\/revisions"}],"predecessor-version":[{"id":24448,"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/insight\/15422\/revisions\/24448"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/media\/24444"}],"wp:attachment":[{"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/media?parent=15422"}],"wp:term":[{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/tags?post=15422"},{"taxonomy":"insight-category","embeddable":true,"href":"https:\/\/www.tejwin.com\/en\/wp-json\/wp\/v2\/insight-category?post=15422"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}