Last active
July 11, 2021 07:45
-
-
Save neoyipeng2018/7a1d601f57d74714bfdcdfb8174e458d to your computer and use it in GitHub Desktop.
GenerateNews.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "GenerateNews.ipynb", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "authorship_tag": "ABX9TyPR8/gD8lSjdhaOZaN3OQqW", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| }, | |
| "accelerator": "GPU", | |
| "widgets": { | |
| "application/vnd.jupyter.widget-state+json": { | |
| "6567677206674378b6fc730cad81ab37": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_f3bbb128cf9547229ade5a6bad8dd824", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_619fc2f268c840eda52f1c5720ea383b", | |
| "IPY_MODEL_d57bd6ce7c21425c96f236acfc0c06f5" | |
| ] | |
| } | |
| }, | |
| "f3bbb128cf9547229ade5a6bad8dd824": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "619fc2f268c840eda52f1c5720ea383b": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_56f32b2190704f8087f416234b6f13b7", | |
| "_dom_classes": [], | |
| "description": "Downloading: 100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 10422021287, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 10422021287, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_2c6cb3481d1240c9a2842c48afe423f9" | |
| } | |
| }, | |
| "d57bd6ce7c21425c96f236acfc0c06f5": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_8c0c021283af40d9a6514425796e450a", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 10.4G/10.4G [04:07<00:00, 42.0MB/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_7106cb27acf24f0da05edeffb395cb9e" | |
| } | |
| }, | |
| "56f32b2190704f8087f416234b6f13b7": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "2c6cb3481d1240c9a2842c48afe423f9": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "8c0c021283af40d9a6514425796e450a": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "7106cb27acf24f0da05edeffb395cb9e": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "e18747d9fe06419fbffdc21fe9821aeb": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_cd9c23449cb344e1a401b7eb1a4b4219", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_b8aad056d3614aed973c73cd696617b2", | |
| "IPY_MODEL_fca4aac93b164128a43b25fe8ca7a53e" | |
| ] | |
| } | |
| }, | |
| "cd9c23449cb344e1a401b7eb1a4b4219": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "b8aad056d3614aed973c73cd696617b2": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_01b9ad19d57a4b2abfa86001a9bfb411", | |
| "_dom_classes": [], | |
| "description": "Downloading: 100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 713229, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 713229, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_a802fc38b087422ca4e83cf53eb13601" | |
| } | |
| }, | |
| "fca4aac93b164128a43b25fe8ca7a53e": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_1c968bc1b5b544beab6cdb877993d36c", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 713k/713k [05:04<00:00, 2.34kB/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_a98d174f6a564f9bb1c368617b037bb0" | |
| } | |
| }, | |
| "01b9ad19d57a4b2abfa86001a9bfb411": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "a802fc38b087422ca4e83cf53eb13601": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "1c968bc1b5b544beab6cdb877993d36c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "a98d174f6a564f9bb1c368617b037bb0": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "d9ba2fb5c8d540a4a72907721298e067": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_c2583447215e46848227d8a48472a917", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_c66cac0b642c49f69a66ed25b972634b", | |
| "IPY_MODEL_38945155fe594fd380474b78aa58dc90" | |
| ] | |
| } | |
| }, | |
| "c2583447215e46848227d8a48472a917": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "c66cac0b642c49f69a66ed25b972634b": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_2b1f1e8110394208b211a0841f7eb73c", | |
| "_dom_classes": [], | |
| "description": "Downloading: 100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 801, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 801, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_98a3741919534591abafcc2460b5dd0a" | |
| } | |
| }, | |
| "38945155fe594fd380474b78aa58dc90": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_9261b417cbf74af6989cd8bbae0694f7", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 801/801 [00:00<00:00, 2.65kB/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_c0d7730620a04f4abc614497b34024cd" | |
| } | |
| }, | |
| "2b1f1e8110394208b211a0841f7eb73c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "98a3741919534591abafcc2460b5dd0a": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "9261b417cbf74af6989cd8bbae0694f7": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "c0d7730620a04f4abc614497b34024cd": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "8862ae9abd784646a4c5c5479c2dddeb": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HBoxModel", | |
| "state": { | |
| "_view_name": "HBoxView", | |
| "_dom_classes": [], | |
| "_model_name": "HBoxModel", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "box_style": "", | |
| "layout": "IPY_MODEL_61c8852a7442462fac25403c4943ecb9", | |
| "_model_module": "@jupyter-widgets/controls", | |
| "children": [ | |
| "IPY_MODEL_4b1874aa6b24472888444fb333c49d6b", | |
| "IPY_MODEL_2fa2f4ba8e5c4d09909f36e2eaf28a6a" | |
| ] | |
| } | |
| }, | |
| "61c8852a7442462fac25403c4943ecb9": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "4b1874aa6b24472888444fb333c49d6b": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "FloatProgressModel", | |
| "state": { | |
| "_view_name": "ProgressView", | |
| "style": "IPY_MODEL_08e0f274cbda4bec939303e38442901c", | |
| "_dom_classes": [], | |
| "description": "Downloading: 100%", | |
| "_model_name": "FloatProgressModel", | |
| "bar_style": "success", | |
| "max": 10388738808, | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": 10388738808, | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "orientation": "horizontal", | |
| "min": 0, | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_91eaa3d93d6440fc86895ec8b74ceff1" | |
| } | |
| }, | |
| "2fa2f4ba8e5c4d09909f36e2eaf28a6a": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "HTMLModel", | |
| "state": { | |
| "_view_name": "HTMLView", | |
| "style": "IPY_MODEL_7946cd2135ab4ab885d78d386d2ac90b", | |
| "_dom_classes": [], | |
| "description": "", | |
| "_model_name": "HTMLModel", | |
| "placeholder": "", | |
| "_view_module": "@jupyter-widgets/controls", | |
| "_model_module_version": "1.5.0", | |
| "value": " 10.4G/10.4G [05:03<00:00, 34.2MB/s]", | |
| "_view_count": null, | |
| "_view_module_version": "1.5.0", | |
| "description_tooltip": null, | |
| "_model_module": "@jupyter-widgets/controls", | |
| "layout": "IPY_MODEL_faab398bff2b44e988f29c7c574092c8" | |
| } | |
| }, | |
| "08e0f274cbda4bec939303e38442901c": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "ProgressStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "ProgressStyleModel", | |
| "description_width": "initial", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "bar_color": null, | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "91eaa3d93d6440fc86895ec8b74ceff1": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| }, | |
| "7946cd2135ab4ab885d78d386d2ac90b": { | |
| "model_module": "@jupyter-widgets/controls", | |
| "model_name": "DescriptionStyleModel", | |
| "state": { | |
| "_view_name": "StyleView", | |
| "_model_name": "DescriptionStyleModel", | |
| "description_width": "", | |
| "_view_module": "@jupyter-widgets/base", | |
| "_model_module_version": "1.5.0", | |
| "_view_count": null, | |
| "_view_module_version": "1.2.0", | |
| "_model_module": "@jupyter-widgets/controls" | |
| } | |
| }, | |
| "faab398bff2b44e988f29c7c574092c8": { | |
| "model_module": "@jupyter-widgets/base", | |
| "model_name": "LayoutModel", | |
| "state": { | |
| "_view_name": "LayoutView", | |
| "grid_template_rows": null, | |
| "right": null, | |
| "justify_content": null, | |
| "_view_module": "@jupyter-widgets/base", | |
| "overflow": null, | |
| "_model_module_version": "1.2.0", | |
| "_view_count": null, | |
| "flex_flow": null, | |
| "width": null, | |
| "min_width": null, | |
| "border": null, | |
| "align_items": null, | |
| "bottom": null, | |
| "_model_module": "@jupyter-widgets/base", | |
| "top": null, | |
| "grid_column": null, | |
| "overflow_y": null, | |
| "overflow_x": null, | |
| "grid_auto_flow": null, | |
| "grid_area": null, | |
| "grid_template_columns": null, | |
| "flex": null, | |
| "_model_name": "LayoutModel", | |
| "justify_items": null, | |
| "grid_row": null, | |
| "max_height": null, | |
| "align_content": null, | |
| "visibility": null, | |
| "align_self": null, | |
| "height": null, | |
| "min_height": null, | |
| "padding": null, | |
| "grid_auto_rows": null, | |
| "grid_gap": null, | |
| "max_width": null, | |
| "order": null, | |
| "_view_module_version": "1.2.0", | |
| "grid_template_areas": null, | |
| "object_position": null, | |
| "object_fit": null, | |
| "grid_auto_columns": null, | |
| "margin": null, | |
| "display": null, | |
| "left": null | |
| } | |
| } | |
| } | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/neoyipeng2018/7a1d601f57d74714bfdcdfb8174e458d/generatenews.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "oNLR2lAtxXiq", | |
| "outputId": "8d29578b-bd47-4434-db5e-8cfdb82028b9" | |
| }, | |
| "source": [ | |
| "!nvidia-smi" | |
| ], | |
| "execution_count": 32, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Sun Jul 11 07:39:17 2021 \n", | |
| "+-----------------------------------------------------------------------------+\n", | |
| "| NVIDIA-SMI 470.42.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", | |
| "|-------------------------------+----------------------+----------------------+\n", | |
| "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
| "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", | |
| "| | | MIG M. |\n", | |
| "|===============================+======================+======================|\n", | |
| "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", | |
| "| N/A 71C P0 29W / 70W | 1418MiB / 15109MiB | 0% Default |\n", | |
| "| | | N/A |\n", | |
| "+-------------------------------+----------------------+----------------------+\n", | |
| " \n", | |
| "+-----------------------------------------------------------------------------+\n", | |
| "| Processes: |\n", | |
| "| GPU GI CI PID Type Process name GPU Memory |\n", | |
| "| ID ID Usage |\n", | |
| "|=============================================================================|\n", | |
| "| No running processes found |\n", | |
| "+-----------------------------------------------------------------------------+\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "uahXc3R_xMh8", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "269c4cbf-9f08-423c-8581-782cb9ab06f9" | |
| }, | |
| "source": [ | |
| "!pip install transformers\n", | |
| "!pip install jieba\n", | |
| "!pip install sentencepiece" | |
| ], | |
| "execution_count": 33, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.8.2)\n", | |
| "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n", | |
| "Requirement already satisfied: huggingface-hub==0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.12)\n", | |
| "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from transformers) (3.13)\n", | |
| "Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)\n", | |
| "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n", | |
| "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n", | |
| "Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n", | |
| "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n", | |
| "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.0)\n", | |
| "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n", | |
| "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n", | |
| "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n", | |
| "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n", | |
| "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n", | |
| "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub==0.0.12->transformers) (3.7.4.3)\n", | |
| "Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n", | |
| "Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n", | |
| "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n", | |
| "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n", | |
| "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n", | |
| "Requirement already satisfied: jieba in /usr/local/lib/python3.7/dist-packages (0.42.1)\n", | |
| "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "VK1eE59CLMMX", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "01ce8094-d2e4-4f4f-df3a-73213265150e" | |
| }, | |
| "source": [ | |
| "# from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline\n", | |
| "# tokenizer = AutoTokenizer.from_pretrained(\"hfl/chinese-xlnet-base\")\n", | |
| "# model = AutoModelForCausalLM.from_pretrained(\"hfl/chinese-xlnet-base\")\n", | |
| "# text_generator = pipeline('text-generation',model=model,tokenizer=tokenizer,device=0)\n", | |
| "\n", | |
| "from transformers import XLNetTokenizer, TFGPT2LMHeadModel\n", | |
| "from transformers import TextGenerationPipeline\n", | |
| "import jieba\n", | |
| "# add special processing: jieba word segmentation, with spaces/newlines mapped to placeholder characters\n", | |
| "class XLNetTokenizer(XLNetTokenizer):\n", | |
| " translator = str.maketrans(\" \\n\", \"\\u2582\\u2583\")\n", | |
| " def _tokenize(self, text, *args, **kwargs):\n", | |
| " text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]\n", | |
| " text = \" \".join(text)\n", | |
| " return super()._tokenize(text, *args, **kwargs)\n", | |
| " def _decode(self, *args, **kwargs):\n", | |
| " text = super()._decode(*args, **kwargs)\n", | |
| " text = text.replace(' ', '').replace('\\u2582', ' ').replace('\\u2583', '\\n')\n", | |
| " return text\n", | |
| "\n", | |
| "tokenizer = XLNetTokenizer.from_pretrained('mymusise/CPM-Generate-distill')\n", | |
| "model = TFGPT2LMHeadModel.from_pretrained(\"mymusise/CPM-Generate-distill\")\n", | |
| "\n", | |
| "text_generator = TextGenerationPipeline(model, tokenizer,device=0)" | |
| ], | |
| "execution_count": 34, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", | |
| "All model checkpoint layers were used when initializing TFGPT2LMHeadModel.\n", | |
| "\n", | |
| "All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at mymusise/CPM-Generate-distill.\n", | |
| "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.\n" | |
| ], | |
| "name": "stderr" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "RBJchlIVLkGP" | |
| }, | |
| "source": [ | |
| "# ents=['格力电器', '广汽集团', '农业银行', '工商银行', '建设银行', '中国银行', '五粮液', '中国石化',\n", | |
| "# '中国石油', '中国石油股份', '中国海洋石油', '中石油', '中国石油化工股份', '中石化', '山西证券',\n", | |
| "# '中信证券', '兴业证券', '华东医药', '可视化', '安信信托', '英伟达', '贵州茅台',\n", | |
| "# '山西汾酒', '华电国际', '国投电力', '长源电力', '煤炭', '卫宁健康', '东华软件', '平安好医生',\n", | |
| "# '宝塔实业', '润邦股份', '浙江广厦', '中国海防', 'ST中天', '兖州煤业', '淡水河谷', '必和必拓',\n", | |
| "# '力拓', '奥维云网', '美的集团', '亿纬锂能', '国电南瑞', '申万宏源', '申万宏源香港',\n", | |
| "# '上港集团', '恒力石化', '海尔智家', '云铝股份','德展健康']\n", | |
| "# texts=[]\n", | |
| "# for e in ents:\n", | |
| "# texts.append(\"\"\"任务:写含有否定的正面句子。\n", | |
| "# 第1句:“有传言公司的收入损失,但这不是真的。”\n", | |
| "# 第2句:“谣传公司的盈利不好,不过事实并非如此。”\n", | |
| "# 第3句:“道听途说净利润同比减41.92%,但它是假的。”\n", | |
| "# 第4句:“传言公司收到深交所监管函,可是这是假的。”\n", | |
| "# 第5句:“\"\"\")\n", | |
| "\n", | |
| "# for _ in range(n): #i'm creating positive sentiment sentences that contain negation\n", | |
| "# texts.append(\"\"\"任务:写五个包含有否定的正面句。\n", | |
| "# 第1句:“谣传公司净利润同比减41.92%,但这不是真的。”\n", | |
| "# 第2句:“公司上诉并赢得不当解雇诉讼。”\n", | |
| "# 第3句:“凭借出色的表现,公司证明分析师是错误的。”\n", | |
| "# 第4句:“传言公司收到深交所监管函,可是这是假的。”\n", | |
| "# 第5句:“\"\"\")\n" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1bZniVM1yHwF", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "9840dab8-f0cd-458c-e099-684835021372" | |
| }, | |
| "source": [ | |
| "n=500\n", | |
| "\n", | |
| "texts=[]\n", | |
| "for _ in range(n): #i'm creating positive sentiment sentences that contain negation/something negative but the company overcame successfully. took from real news content that contains negation\n", | |
| " texts.append(\"\"\"任务:写出相似情绪的正面句子。\n", | |
| " 第1句:“海外疫情则始终得不到有效控制,在这么不利的外部环境下,公司捞取得现在的业绩,实属不易。”\n", | |
| " 第2句:“公司成功推翻了法院的裁决,该裁决要求其退款给客户。”\n", | |
| " 第3句:“\"\"\")\n", | |
| "\n", | |
| "headlines=[]\n", | |
| "for text in texts:\n", | |
| " token_len=len(tokenizer.tokenize(text))\n", | |
| " #nucleus or top-p sampling: \n", | |
| " #top_p=0.9,top_k=5 from Schick & Schütze 2021 for generating datasets. Remove k for diversity. https://arxiv.org/abs/2104.07540\n", | |
| " hl=text_generator(text,max_length=token_len+40,top_p=0.9,use_cache=True,do_sample=True, return_text=True,return_full_text=False)[0]['generated_text'] \n", | |
| " hl=hl.split('”')[0]\n", | |
| " if len(set(hl))>10 and '公司' in hl: #simple rule to clean out noise: should have 10 unique characters at least and also contain company('公司') since all sample sentences have company inside.\n", | |
| " print(hl)\n", | |
| " headlines.append(hl)\n", | |
| " # print(\"----------------------------------\\n\")\n", | |
| "\n", | |
| "headlineNEW=[x.split('\\n')[0] for x in headlines]\n", | |
| "import pandas as pd\n", | |
| "import re\n", | |
| "df = pd.DataFrame(headlineNEW, columns=[\"headlineNEW\"])\n", | |
| "# df['headlineNEW']=df['headlineNEW'].str.split('”',expand=True).iloc[:,0]\n", | |
| "df['EngText']=df.headlineNEW.apply(lambda x: re.search('[a-zA-Z]', x)) #simple rule to clean out noise by removing text that contain english\n", | |
| "df=df[df.EngText.isnull()]\n", | |
| "df.to_csv('headlineNEW.csv', index=False)\n", | |
| "print('DONE')\n", | |
| "df['headlineNEW'].head(50)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "公司是一个充满风险的公司。\n", | |
| "公司的员工在没有任何证据的情况下被拘留,这是无法避免的。\n", | |
| "公司董事长将成为美国最受欢迎的CEO,而非中国的CEO。\n", | |
| "公司—— 被宣布为‘ 国家的‘ 公司’。\n", | |
| "公司从财务部辞职,辞职后的三个月,没有进行任何管理,反而开始了一些对公司不利的工作。\n", | |
| "公司的核心价值之一,正是公司本身。\n", | |
| "公司有责任用自己的行动去纠正错误。\n", | |
| "公司与顾客之间的利益在逐渐破裂,公司需要重新审视自己的行为。\n", | |
| "公司在这方面做得很成功。\n", | |
| "董事会和公司决策层一致通过了新的法案,公司要用自己的企业去拯救这些可怜人们。\n", | |
| "因为我要在市场上争取到更多的话语权,公司应该向我表达承诺。\n", | |
| "在这里,公司必须为客户付出,并且为社会做出贡献。\n", | |
| "公司取得的成绩都已经不重要了。\n", | |
| "员工也许无法忍受,但不想离开,但更希望与政府沟通,让政府为公司考虑。\n", | |
| "然而当时公司已经陷入僵局,身为上市公司的股东,我认为现在的问题是应该如何处理,而这种僵局\n", | |
| "公司为了保持员工安全,通过了安保措施,为此赢得了很多的公众的关注。\n", | |
| "公司在董事长的办公室里发表了声明,表示公司是有责任的。\n", | |
| "公司做出了一系列错误决定,损害公司利益,公司的前途就要受到影响。\n", | |
| "公司将在未来某一天,会从另一个公司的账户中收回这些证据。\n", | |
| "公司在海外经营着许多项目。公司在全球范围内投资了超过3000万美元,但鉴于种种原因,公司现在无法再获得应有的收益了。\n", | |
| "公司成功推翻了某公司的股东大会。\n", | |
| "在这么不利的外部环境下,公司捞取得现在的业绩,实属不易。\n", | |
| "公司仍然保持着原有的经营模式。\n", | |
| "如果企业的主要责任落到了股东,那么该公司在这场危机之前,已经是‘受害者’了。\n", | |
| "倘若公司能够做出一个正确的决定,业界的领导者会更加相信他。\n", | |
| "公司通过积极开展‘走出非洲’计划,让非洲人民都可以走出非洲。\n", | |
| "公司应该让媒体对公司产生负面影响。\n", | |
| "现在就可以,这里的法律是由公司所建立的,所以公司将可以做到一个合法的社会。\n", | |
| "公司应该把最有价值的东西交给最重要的人。\n", | |
| "公司员工的行为,已造成了社会的恶劣影响。\n", | |
| "公司在这个过程中取得的辉煌成就,是我们为之自豪的。\n", | |
| "公司应该向政府反映,政府应该提供帮助。\n", | |
| "公司现在的核心竞争力是人,而不是机器,因此公司的生存和发展并没有丝毫的悬念。\n", | |
| "有人把本公司的一个文件给你看,那是由公司的一个股东签署的。\n", | |
| "公司仍在坚持,并未停止推翻判决。\n", | |
| "公司在最短时间内完成了起诉。\n", | |
| "公司的战略性投资获得了良好的效果,对公司造成了巨大的损害,为公司作出了重要的贡献。\n", | |
| "为了让您真正了解公司情况,我们对他进行了多次的电话回访。\n", | |
| "这位女士对于公司的表现表示担忧,如果公司仍然继续发展下去,她会一直承受不利的环境,为自己的未来埋下隐患\n", | |
| "公司的员工要做的是立刻把公司的财产归还给人家。\n", | |
| "因为这家公司一直为某些政治目的而工作。\n", | |
| "我们公司的业务是通过纳斯达克的上市交易来实现。\n", | |
| "但是公司没有得到解雇。\n", | |
| "我们在处理所有问题时不考虑对公司不利的因素。\n", | |
| "一个没有任何法律法规支持的公司,必然会遇到许多问题,但我会通过本书来告诉你如何在关键时刻做正确的事。\n", | |
| "公司成功接管了一家外国公司,公司成了全球第一个有经济实力的公司。\n", | |
| "‘中国企业海外部经理’、‘中国国际航空公司的总经理’、‘世界之最’、‘中国之星’的CEO\n", | |
| "我的客户和公司利益在美国界是很大的,如果美国政府不批准在这里工作,那么将很可能受到伤害。\n", | |
| "在过去的岁月里,“利欲熏心的公司在公众心中已经是个罪犯。\n", | |
| "公司在这场大战中,在这方面做得很成功。\n", | |
| "公司是在这种恶劣的环境中生存下来的。\n", | |
| "终于有个人知道公司为何会失败了。\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "Ai-modJ4IvUm" | |
| }, | |
| "source": [ | |
| "Try distilled CPM-Generate. Not expecting much - it's the best." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "PtD8VLCtIsPe" | |
| }, | |
| "source": [ | |
| "from transformers import XLNetTokenizer, TFGPT2LMHeadModel\n", | |
| "from transformers import TextGenerationPipeline\n", | |
| "import jieba\n", | |
| "# add special processing: jieba word segmentation, with spaces/newlines mapped to placeholder characters\n", | |
| "class XLNetTokenizer(XLNetTokenizer):\n", | |
| " translator = str.maketrans(\" \\n\", \"\\u2582\\u2583\")\n", | |
| " def _tokenize(self, text, *args, **kwargs):\n", | |
| " text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]\n", | |
| " text = \" \".join(text)\n", | |
| " return super()._tokenize(text, *args, **kwargs)\n", | |
| " def _decode(self, *args, **kwargs):\n", | |
| " text = super()._decode(*args, **kwargs)\n", | |
| " text = text.replace(' ', '').replace('\\u2582', ' ').replace('\\u2583', '\\n')\n", | |
| " return text\n", | |
| "\n", | |
| "tokenizer = XLNetTokenizer.from_pretrained('mymusise/CPM-Generate-distill')\n", | |
| "model = TFGPT2LMHeadModel.from_pretrained(\"mymusise/CPM-Generate-distill\")\n", | |
| "\n", | |
| "text_generator = TextGenerationPipeline(model, tokenizer,device=0)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "ANgHmqeiIsMh", | |
| "outputId": "edf5e1e2-38c9-4782-ab29-d58f33458960" | |
| }, | |
| "source": [ | |
| "text_generator('任务:写出5个有相似情绪的句子。第1句:“', max_length=50, do_sample=True, top_p=0.9,top_k=5)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "[{'generated_text': '任务:写出5个有相似情绪的句子。第1句:“我不喜欢你,” 第2句:“我不喜欢你,”'}]" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 21 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "QvtrpzFWIsI9", | |
| "outputId": "de3899b4-79b7-4ced-a989-df8ef7da9ed4" | |
| }, | |
| "source": [ | |
| "text_generater(\"天下熙熙,\", max_length=15, top_k=1, use_cache=True, prefix='')" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "[{'generated_text': '天下熙熙,皆为利禄。 '}]" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 6 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "za1slvrLIqFU" | |
| }, | |
| "source": [ | |
| "TsinghuaAI/CPM-Generate is too big (~10 GB checkpoint download)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 83, | |
| "referenced_widgets": [ | |
| "6567677206674378b6fc730cad81ab37", | |
| "f3bbb128cf9547229ade5a6bad8dd824", | |
| "619fc2f268c840eda52f1c5720ea383b", | |
| "d57bd6ce7c21425c96f236acfc0c06f5", | |
| "56f32b2190704f8087f416234b6f13b7", | |
| "2c6cb3481d1240c9a2842c48afe423f9", | |
| "8c0c021283af40d9a6514425796e450a", | |
| "7106cb27acf24f0da05edeffb395cb9e" | |
| ] | |
| }, | |
| "id": "yvMkZXnGFaN5", | |
| "outputId": "78a89c9a-28de-4af1-bcd7-f2b3fe52a4b3" | |
| }, | |
| "source": [ | |
| "from transformers import TextGenerationPipeline, AutoTokenizer, AutoModelForCausalLM\n", | |
| "\n", | |
| "tokenizer = AutoTokenizer.from_pretrained(\"TsinghuaAI/CPM-Generate\")\n", | |
| "model = AutoModelForCausalLM.from_pretrained(\"TsinghuaAI/CPM-Generate\")\n", | |
| "\n", | |
| "text_generator = TextGenerationPipeline(model, tokenizer,device=0)\n", | |
| "text_generator('清华大学', max_length=50, do_sample=True, top_p=0.9)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" | |
| ], | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "6567677206674378b6fc730cad81ab37", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10422021287.0, style=ProgressStyle(desc…" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "-4fc6TxFFaHn" | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Obh2zXIOFaDY" | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "1OuGpkKODQuR" | |
| }, | |
| "source": [ | |
| "CPM-GPT2 is not very good." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "PPAZJWsow0cQ", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 249, | |
| "referenced_widgets": [ | |
| "e18747d9fe06419fbffdc21fe9821aeb", | |
| "cd9c23449cb344e1a401b7eb1a4b4219", | |
| "b8aad056d3614aed973c73cd696617b2", | |
| "fca4aac93b164128a43b25fe8ca7a53e", | |
| "01b9ad19d57a4b2abfa86001a9bfb411", | |
| "a802fc38b087422ca4e83cf53eb13601", | |
| "1c968bc1b5b544beab6cdb877993d36c", | |
| "a98d174f6a564f9bb1c368617b037bb0", | |
| "d9ba2fb5c8d540a4a72907721298e067", | |
| "c2583447215e46848227d8a48472a917", | |
| "c66cac0b642c49f69a66ed25b972634b", | |
| "38945155fe594fd380474b78aa58dc90", | |
| "2b1f1e8110394208b211a0841f7eb73c", | |
| "98a3741919534591abafcc2460b5dd0a", | |
| "9261b417cbf74af6989cd8bbae0694f7", | |
| "c0d7730620a04f4abc614497b34024cd", | |
| "8862ae9abd784646a4c5c5479c2dddeb", | |
| "61c8852a7442462fac25403c4943ecb9", | |
| "4b1874aa6b24472888444fb333c49d6b", | |
| "2fa2f4ba8e5c4d09909f36e2eaf28a6a", | |
| "08e0f274cbda4bec939303e38442901c", | |
| "91eaa3d93d6440fc86895ec8b74ceff1", | |
| "7946cd2135ab4ab885d78d386d2ac90b", | |
| "faab398bff2b44e988f29c7c574092c8" | |
| ] | |
| }, | |
| "outputId": "d5509736-04d1-4882-b56d-8c746c70bf8d" | |
| }, | |
| "source": [ | |
| "from transformers import XLNetTokenizer, TFGPT2LMHeadModel\n", | |
| "import jieba\n", | |
| "from transformers import pipeline\n", | |
| "# add special processing \n", | |
| "class XLNetTokenizer(XLNetTokenizer):\n", | |
| " translator = str.maketrans(\" \\n\", \"\\u2582\\u2583\")\n", | |
| "def _tokenize(self, text, *args, **kwargs):\n", | |
| " text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]\n", | |
| " text = \" \".join(text)\n", | |
| " return super()._tokenize(text, *args, **kwargs)\n", | |
| "def _decode(self, *args, **kwargs):\n", | |
| " text = super()._decode(*args, **kwargs)\n", | |
| " text = text.replace(' ', '').replace('\\u2582', ' ').replace('\\u2583', '\\n')\n", | |
| " return text\n", | |
| "tokenizer = XLNetTokenizer.from_pretrained('mymusise/CPM-GPT2')\n", | |
| "model = TFGPT2LMHeadModel.from_pretrained(\"mymusise/CPM-GPT2\")\n", | |
| "text_generator = pipeline('text-generation',model=model,tokenizer=tokenizer,device=0)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "e18747d9fe06419fbffdc21fe9821aeb", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=713229.0, style=ProgressStyle(descripti…" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" | |
| ], | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "d9ba2fb5c8d540a4a72907721298e067", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801.0, style=ProgressStyle(description_…" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/vnd.jupyter.widget-view+json": { | |
| "model_id": "8862ae9abd784646a4c5c5479c2dddeb", | |
| "version_minor": 0, | |
| "version_major": 2 | |
| }, | |
| "text/plain": [ | |
| "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10388738808.0, style=ProgressStyle(desc…" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| } | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "All model checkpoint layers were used when initializing TFGPT2LMHeadModel.\n", | |
| "\n", | |
| "All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at mymusise/CPM-GPT2.\n", | |
| "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.\n" | |
| ], | |
| "name": "stderr" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "-kh2A3PlxPIT" | |
| }, | |
| "source": [ | |
| "Generate positive headlines that contain negation" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "9_tQew-7xOru" | |
| }, | |
| "source": [ | |
| "texts=[]\n", | |
| "for e in ents:\n", | |
| " texts.append(\"\"\"\n", | |
| " 华油钴:求关系改善,华油钴供的业绩不再降低。\\n\n", | |
| " 目标超市的崛起:每天开设超过13家门店,目标超市的收入不再降低。\\n\n", | |
| " \"\"\"+e)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ni6A3aoykAYp", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 249 | |
| }, | |
| "outputId": "7d4ace07-ede1-470f-8178-48da706a0f59" | |
| }, | |
| "source": [ | |
| "headlines=[]\n", | |
| "for text in texts:\n", | |
| " token_len=len(tokenizer._tokenize(text))\n", | |
| " hl=text_generator(text,max_length=token_len+40,top_p=0.9,top_k=5,use_cache=True,do_sample=True,\n", | |
| " return_text=True,return_full_text=False)[0]['generated_text'] #p=0.9/k=5 from Schick & Schütze 2021 for generating datasets. Remove k for diversity. https://arxiv.org/abs/2104.07540\n", | |
| " headlines.append(hl)\n", | |
| " print(hl)\n", | |
| " print(\"-------\\n\")" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "error", | |
| "ename": "AttributeError", | |
| "evalue": "ignored", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-13-9813f24da3b3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mheadlines\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtexts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtoken_len\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m hl=text_generator(text,max_length=token_len+40,top_p=0.9,top_k=5,use_cache=True,do_sample=True,\n\u001b[1;32m 5\u001b[0m return_text=True,return_full_text=False)[0]['generated_text'] #p=0.9/k=5 form Schtick 2021 for generating datsets. Remove k for diversity.https://arxiv.org/abs/2104.07540\n", | |
| "\u001b[0;31mAttributeError\u001b[0m: 'XLNetTokenizerFast' object has no attribute '_tokenize'" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "RKu6IyF5mGdE" | |
| }, | |
| "source": [ | |
| "headlineNEW=[x.split('\\n')[0] for x in headlines]\n", | |
| "import pandas as pd\n", | |
| "df = pd.DataFrame(headlineNEW, columns=[\"headlineNEW\"])\n", | |
| "df.to_csv('headlineNEW.csv', index=False)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "K8EZFxaMwmD2" | |
| }, | |
| "source": [ | |
| "headlineNEW=pd.read_csv('headlineNEW.csv')" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 204 | |
| }, | |
| "id": "WaMKxg6cwrO1", | |
| "outputId": "e4e82bcc-2d28-43c7-8058-aaf033dfd56a" | |
| }, | |
| "source": [ | |
| "headlineNEW.head()" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>headlineNEW</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>【 天风 拂晓 】 : “ 天风 ” , 中国 的 “ 风 ” , 吹拂 着 “ 格力 ”...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>中国 汽车行业 的 “ 黑马 ” : 广汇 汽车 正面新闻头条 : 广汇 汽车 的 “ 头...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>【 农业银行 】 农业银行 正面新闻头条 : 【 中国农业银行 】 公司 : ▂ 中国工商...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>【 工商银行 】 工商银行 : “ 中国 第一家 上市 银行 ” , 中国 第一家 股份制...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>【 建设银行 】 建设银行 : 积极支持 地方 政府 , 促进 地方 经济社会 发展 ▃ ...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " headlineNEW\n", | |
| "0 【 天风 拂晓 】 : “ 天风 ” , 中国 的 “ 风 ” , 吹拂 着 “ 格力 ”...\n", | |
| "1 中国 汽车行业 的 “ 黑马 ” : 广汇 汽车 正面新闻头条 : 广汇 汽车 的 “ 头...\n", | |
| "2 【 农业银行 】 农业银行 正面新闻头条 : 【 中国农业银行 】 公司 : ▂ 中国工商...\n", | |
| "3 【 工商银行 】 工商银行 : “ 中国 第一家 上市 银行 ” , 中国 第一家 股份制...\n", | |
| "4 【 建设银行 】 建设银行 : 积极支持 地方 政府 , 促进 地方 经济社会 发展 ▃ ..." | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 8 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "bg5gix33-zbI" | |
| }, | |
| "source": [ | |
| "Create news content given a news headline (inspired by https://arxiv.org/abs/2104.07540)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "MuA8tYuy-bji" | |
| }, | |
| "source": [ | |
| "import pandas as pd\n", | |
| "headlineNEW=pd.read_csv('PosHeadline.csv')" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "h8WqI7T7-gac", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "742cf6e7-7273-4eaf-855f-b023f581f6c5" | |
| }, | |
| "source": [ | |
| "headlineNEW=headlineNEW['headlineNEW'].tolist()\n", | |
| "headlineNEW" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "[' 【 天风 拂晓 】 : “ 天风 ” , 中国 的 “ 风 ” , 吹拂 着 “ 格力 ” 公司 :',\n", | |
| " ' 中国 汽车行业 的 “ 黑马 ” : 广汇 汽车 正面新闻头条 : 广汇 汽车 的 “ 头 ” 字号 ▃ 【 ▂ 正面',\n", | |
| " ' 【 农业银行 】 农业银行 正面新闻头条 : 【 中国农业银行 】 公司 : ▂ 中国工商银行 正面新闻头条 : 【 中国工商银行',\n", | |
| " ' 【 工商银行 】 工商银行 : “ 中国 第一家 上市 银行 ” , 中国 第一家 股份制 银行 , 中国 第一家 “ 银行 + 保险 ” 的',\n", | |
| " ' 【 建设银行 】 建设银行 : 积极支持 地方 政府 , 促进 地方 经济社会 发展 ▃ 诗词 : 忆 昔同 舟共泛 , 今',\n", | |
| " ' 中国银行 的 “ 头版 ” : “ 中国银行 ” 的 新 形象 , “ 中国银行 ” 的 新气象 公司 : 中国建设银行 正面',\n", | |
| " ' 五粮液 : 中国 酒业 的 “ 金三角 ” 公司 : ▂ ▃ ▂ 公司简介 ▂ 公司 名称 : ▂ 华油 股份 有限公司',\n", | |
| " ' 中国 石油 正面新闻头条 : 中国 石化 正面新闻头条 : 中国 海洋 石油 正面新闻头条 : 中国 航空 油料 正面新闻头条 : 中国 中化 正面新闻',\n", | |
| " ' 中国 石油勘探 开发 正面 报道 , “ 中国 石油 ” 品牌价值 达 3.98 亿美元 公司 : 中海油 正面新闻头条 : 中海油 “ 中',\n", | |
| " ' 中国 石油 : 油价 下跌 , 中国石油集团公司 股价 大幅 下跌 公司 : 中化 化肥 正面新闻头条 : 中化 化肥 : 化肥 出口量 大幅 下降 公司',\n", | |
| " ' 中国 石油 公司 正面新闻 : 中国 海洋 石油 公司 正面新闻 : 中国 海洋 石油 公司 正面 新闻 : 中国海洋石油公司 正面 新闻 : 中国海洋石油总公司',\n", | |
| " ' “ 中石油 ” 的 新面貌 : 跻身 世界 石油 公司 行列 公司 : 道 达尔 正面新闻头条 : 道 达尔 , 世界 第二 大',\n", | |
| " ' 中石化 的 “ 三高 ” : 高 增长 、 高 利润 、 高 技术 公司 : 中国 石油 天然气 股份 正面新闻头条 : 中国 石油 天然气 股份公司',\n", | |
| " ' 【 中化 石油 】 中化 石油 , 中国 石化 的 新起点 公司 : 施耐德 电气 正面新闻头条 : 【 施耐德 电气',\n", | |
| " ' 《 证券时报 》 : 中国 股市 : “ 中国 的 拉斯维加斯 ” ▃ 诗词 : 忆 昨 ,',\n", | |
| " ' 中信证券 股份 有限公司 的 成立 : “ 中信 ” 品牌 “ 出征 ” 华尔街 , “ 中信 ” 品牌 在 美国 上市公司 : 力拓',\n", | |
| " ' 【 “ 两会 ” : 中国 经济 的 风向标 公司 : 中国 石化 正面新闻头条 : 中国石化集团 正面新闻头条 : “ 力',\n", | |
| " ' 【 中国 制药 工业协会 】 “ 中国 制药 工业协会 ” 成立 于 1989 年 , 是 我国 医药行业 唯一 的 国家一级 行业协会 。 协会 成立 以来',\n", | |
| " ' 《 中国 国家 地理 》 杂志 : 中国 人 的 衣食住行 公司 : 华龙 一号 正面新闻头条 : 《 中国 国家 地理 》 杂志 : 中国 人',\n", | |
| " ' “ 安信 ” 在 中国 的 业务 节节 攀升 , 业绩 节节 攀升 公司 : 惠而浦 正面新闻头条 : “ 惠',\n", | |
| " ' “ 业界 良心 ” 公司业绩 节节 攀升 , 股价 却 在 下滑 公司业绩 节节 攀升 , 股价 却 在 下滑 公司业',\n", | |
| " ' “ 贵州 茅台 ” 的 崛起 : 每瓶 酒 的 利润 超过 10 万元 人民币 公司 : 上海大众 正面新闻头条 : 中国 汽车 市场 的 “',\n", | |
| " ' 【 山西汾酒 】 山西汾酒 的 成功之路 公司 : 【 中国 人寿 】 中国 人寿保险 公司 的 新 形象 : “ 中国 人寿保险 股份 有限公司',\n", | |
| " ' 中国 的 “ 耐克 ” 公司 : 中国 的 “ 麦当劳 ” 公司 : 中国 的 “ 家乐福 ” 公司 : 中国 的 “ 李宁 ” 公司',\n", | |
| " ' 国投 电力公司 : “ 国投 ” 品牌 在 世界 范围 内 , “ 国投 ” 品牌 在 中国 市场 的 占有率 达 50% 公司 : 中国建设',\n", | |
| " ' 【 长源 】 的 “ 家 ” 是 怎样 炼成 的 公司 : 【 美的 】 的 “ 美 ” 字 : “ 家',\n", | |
| " ' “ 煤老板 ” 的 春天 到来 了 公司 : “ 雀巢 ” 的 春天 来 了 公司 : “ 百事 ” 的 春天 来 了 公司 :',\n", | |
| " ' 《 卫宁 》 杂志 创刊 , 健康 产业 迈向 新台阶 公司 : 美邦 正面新闻头条 : 美邦 , 让 国人 更 了解 自己 的',\n", | |
| " ' 东华 软件 : “ ” 的 一声 , 中国 纺织工业 的 “ 金字招牌 ” 落 了 地 公司 : 中国 石化 正面新闻头条 :',\n", | |
| " ' 平安保险 : 业界 领先 的 医疗保险 公司 公司 : 中国 平安保险 公司 正面新闻头条 : 中国 平安 人寿保险 股份 有限公司 正面新闻头条 : 中国 平安',\n", | |
| " ' 【 】 公司 在 上海 开设 了 第一家 海外 零售店 , 【 】 在 中国 内地 开设 了 第一家 海外 零售店 , 【',\n", | |
| " ' 中国 最大 的 服装 零售 公司 润邦 股份 : “ 润 ” 在 中国 润邦 股份 : 中国 最大 的 服装 零售 公司 润邦 股份 : “ 邦 ”',\n", | |
| " ' 【 浙江广厦 】 浙江广厦 : “ 家 ” 的 感觉 真 好 公司 : ▂ “ 中国 制造 ” ▂ 的 新起点',\n", | |
| " ' “ 中国海事 服务网 ” 开通 了 网上支付 服务 , 方便 客户 网上支付 公司 : “ 中国 — 东盟自由贸易区 ” 正式 成立',\n", | |
| " ' ST 的 “ 头牌 ” 是 “ 头 ” , “ 王牌 ” 是 “ 王牌 ” , 头牌 、 王牌 、 王牌',\n", | |
| " ' 兖煤 集团 的 发展 : 从无到有 , 从小到大 公司 : 利丰 集团 正面新闻头条 : 利丰 集团 的 发展 : 从无到',\n", | |
| " ' 淡水河谷 公司 成立 于 1869 年 , 是 全球 第二 大 的 服装 公司 。 公司总部 设在 美国 , 在 全球 30 多个 国家 设有 销售 机构',\n", | |
| " ' “ 必 和 必拓 ” — — 全球 最 大规模 的 石油 和 天然气 开采 企业 之一 , 其 产量 、 销售额 、 利润 和 市场占有率 均 位居 世界',\n", | |
| " ' 力拓 公司 成立 ▂ 力拓 : 力拓 的 新 发展 ▂ 力拓 : 力拓 的 新 目标 ▂ 力拓 :',\n", | |
| " ' 【 天风 证券 : “ 钴 ” 价 上升 带动 “ 钒 ” 价 】 天风 证券 : 钒 价上扬 带动',\n", | |
| " ' 【 “ 美的 ” 】 , 中国家电 制造 第一 品牌 公司 : “ 家乐福 ” , 全球 最大 零售商 公司 , 年销售额',\n", | |
| " ' “ 锂 ” 字头 公司 的 新面孔 : 锂业 “ 三驾 马车 ” 公司业绩 增长 , 利润 增加 , “ 三驾 马车',\n", | |
| " ' “ 国电 南瑞 ” 的 成功之路 公司 : “ 中国 电力 投资 有限责任 公司 ” 正面新闻 : 中国 电力 投资 有限公司 的 成立 , 标志 着',\n", | |
| " ' 中国 有色金属 工业协会 、 中国金属学会 、 中国金属学会 有色金属 行业 分会 、 中国 有色金属 学会 ▃ 、 中国金属学会 ▃ 、',\n", | |
| " ' 【 申万宏源香港 】 申万宏源香港 的 发展 历程 : 【 申万宏源香港 】 申万宏源香港 的',\n", | |
| " ' 中国 远洋 : “ 近水楼台先得月 ” , 中国 远洋 的 快速 发展 带动 了 上 港 的 发展 , 上港 的 发展 又 带动 了 中国 远洋',\n", | |
| " ' 【 天风有色杨诚笑团队 】 恒力石化 : “ 有色金属 王国 ” 的 兴起 公司 : 中化 国际 正面新闻头条',\n", | |
| " ' “ 中国 第一 ” 的 “ 海尔 ” “ 世界 第一 ” 的 “ 联想 ” “ 全球 第一 ” 的 “ 华为 ” “ 世界 一流',\n", | |
| " ' “ 云铝 ” 成为 “ 中国 名牌 ” 公司 : 力帆 正面新闻头条 : 力帆 “ 力帆 ” 品牌 跻身',\n", | |
| " ' 德展 : “ 中国 第一 , 世界 第三 ” 的 目标 已经 实现 , “ 健康 、 快乐 、 时尚 、 健康 ” 已 成为 中国 人 生活']" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 2 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "MO3Pmwhe-kfJ", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "8187611c-25ae-412f-da85-04f09ded41b1" | |
| }, | |
| "source": [ | |
| "texts=[]\n", | |
| "for e,h in zip(ents,headlineNEW):\n", | |
| " texts.append(\"\"\"公司: 华友钴业\n", | |
| " 正面新闻头条:【天风有色杨诚笑团队】华油钴:钴价上涨带动了业绩,供求关系改善,业绩有望回升\n", | |
| " 正面新闻内容:事项:\"公司发布2019年半年报,实现营收91.04亿元,同增34.21%;归母净利润0.33亿元,同减97.81%,扣非归母净利润-0.41亿元,同减102.74%,其中,计提资产减值3.26亿元。\n", | |
| " 同时,公司拟通过华友国际矿业收购华友控股香港持有的40%股权,交易金额1026.548万美元,维斯通持有玮达贝能源90%的股权,后者拟在印尼投资建设1×250MW火力发电项目,用于印尼红土镍矿的冶炼。\n", | |
| " 钴价下跌带来吨利润下滑以及资产减值损失,业绩大幅下滑2019年上半年,MB低级钴均价17.18美元/磅,同减58.17%,环减49.16%;中国四钴均价20.44万元/吨,同减55.43%,环减41.20%;硫酸钴均价5.39万元/吨,同减59.08%,环减41.22%。\n", | |
| " 报告期内,公司生产钴产品12,645吨金属量,同增17.79%;销售钴产品12,829吨金属量,同增37.16%,因钴价大幅上涨,吨利润下滑叠加资产减值损失3.26亿元,公司业绩大幅下滑。\" \n", | |
| " 公司: 拉夏贝尔\n", | |
| " 正面新闻头条: 中文版ZARA的崛起:每天开设超过13家门店,半年利润近5亿美元\n", | |
| " 正面新闻内容: 中国版ZARA\"的一场硬仗:每天关店超13家,上半年亏损近5亿为了登陆A股,拉夏贝尔曾在6年内发起三次\"冲击\"。现在,这家成功登陆港股和A股两地市场的服装企业陷入又一场\"战役\"。\n", | |
| " 8月28日晚间,拉夏贝尔的半年报正式公布:营收下滑9.78%至39.51亿元,净利润下滑311.2%至-4.98亿元。对于曾拥有过\"高光时刻\"的拉夏贝尔来说,这样的业绩同其已有的品牌影响力难以匹配。\n", | |
| " 一时间,关于拉夏贝尔策略有误导致发展\"失速\"的言论四起。\"2019年作为公司第三个10年的起点,公司正在经历大规模的战略收缩。\"\n", | |
| " 公司: \"\"\"+e+\"\"\"\n", | |
| " 正面新闻头条:\"\"\"+h+\"\"\"\n", | |
| " 正面新闻内容:\"\"\")\n", | |
| "content=[]\n", | |
| "for text in texts:\n", | |
| " token_len=len(tokenizer._tokenize(text))\n", | |
| " c=text_generator(text,max_length=token_len+100,top_p=0.9,use_cache=True,do_sample=True,return_text=True,return_full_text=False)[0]['generated_text'] #,temperature=0.7 #top_p=0.9/top_k=5 follows Schick & Schütze 2021 for generating datasets. Omit top_k if you want more diversity\n", | |
| " content.append(c)\n", | |
| " print(c)\n", | |
| "import pandas as pd\n", | |
| "df = pd.DataFrame(content, columns=[\"content\"])\n", | |
| "df.to_csv('content.csv', index=False)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| " “ 今年 ( 2002 年 ) 亏损 1.34 亿元 , 比 上年 同期 下降 38.82% 。 2001 年度 生产量 同比 下降 31.5% , 2002 年 上半年 产量 降幅 达到 15.8% 。 ... ... 今年 ( 2002 年 ) 上半年 格力 冰箱 压缩机 产量 达 15 亿台 , 同比 增长 34.4% ; 冰箱 产量 达 14 亿台 , 同比 增长 32.3% ... ... 公司 正在 积聚 力量\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "LeTEs8Pm-kb_" | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "NuFMd42a29rH" | |
| }, | |
| "source": [ | |
| "" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment