Skip to content

Instantly share code, notes, and snippets.

@neoyipeng2018
Last active July 11, 2021 07:45
Show Gist options
  • Select an option

  • Save neoyipeng2018/7a1d601f57d74714bfdcdfb8174e458d to your computer and use it in GitHub Desktop.

Select an option

Save neoyipeng2018/7a1d601f57d74714bfdcdfb8174e458d to your computer and use it in GitHub Desktop.
GenerateNews.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "GenerateNews.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyPR8/gD8lSjdhaOZaN3OQqW",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"6567677206674378b6fc730cad81ab37": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_f3bbb128cf9547229ade5a6bad8dd824",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_619fc2f268c840eda52f1c5720ea383b",
"IPY_MODEL_d57bd6ce7c21425c96f236acfc0c06f5"
]
}
},
"f3bbb128cf9547229ade5a6bad8dd824": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"619fc2f268c840eda52f1c5720ea383b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_56f32b2190704f8087f416234b6f13b7",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 10422021287,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 10422021287,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_2c6cb3481d1240c9a2842c48afe423f9"
}
},
"d57bd6ce7c21425c96f236acfc0c06f5": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_8c0c021283af40d9a6514425796e450a",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 10.4G/10.4G [04:07<00:00, 42.0MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_7106cb27acf24f0da05edeffb395cb9e"
}
},
"56f32b2190704f8087f416234b6f13b7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"2c6cb3481d1240c9a2842c48afe423f9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8c0c021283af40d9a6514425796e450a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"7106cb27acf24f0da05edeffb395cb9e": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"e18747d9fe06419fbffdc21fe9821aeb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_cd9c23449cb344e1a401b7eb1a4b4219",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_b8aad056d3614aed973c73cd696617b2",
"IPY_MODEL_fca4aac93b164128a43b25fe8ca7a53e"
]
}
},
"cd9c23449cb344e1a401b7eb1a4b4219": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"b8aad056d3614aed973c73cd696617b2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_01b9ad19d57a4b2abfa86001a9bfb411",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 713229,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 713229,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_a802fc38b087422ca4e83cf53eb13601"
}
},
"fca4aac93b164128a43b25fe8ca7a53e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_1c968bc1b5b544beab6cdb877993d36c",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 713k/713k [05:04<00:00, 2.34kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_a98d174f6a564f9bb1c368617b037bb0"
}
},
"01b9ad19d57a4b2abfa86001a9bfb411": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"a802fc38b087422ca4e83cf53eb13601": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"1c968bc1b5b544beab6cdb877993d36c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"a98d174f6a564f9bb1c368617b037bb0": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"d9ba2fb5c8d540a4a72907721298e067": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_c2583447215e46848227d8a48472a917",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_c66cac0b642c49f69a66ed25b972634b",
"IPY_MODEL_38945155fe594fd380474b78aa58dc90"
]
}
},
"c2583447215e46848227d8a48472a917": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"c66cac0b642c49f69a66ed25b972634b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_2b1f1e8110394208b211a0841f7eb73c",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 801,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 801,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_98a3741919534591abafcc2460b5dd0a"
}
},
"38945155fe594fd380474b78aa58dc90": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_9261b417cbf74af6989cd8bbae0694f7",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 801/801 [00:00<00:00, 2.65kB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_c0d7730620a04f4abc614497b34024cd"
}
},
"2b1f1e8110394208b211a0841f7eb73c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"98a3741919534591abafcc2460b5dd0a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"9261b417cbf74af6989cd8bbae0694f7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"c0d7730620a04f4abc614497b34024cd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"8862ae9abd784646a4c5c5479c2dddeb": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HBoxModel",
"state": {
"_view_name": "HBoxView",
"_dom_classes": [],
"_model_name": "HBoxModel",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.5.0",
"box_style": "",
"layout": "IPY_MODEL_61c8852a7442462fac25403c4943ecb9",
"_model_module": "@jupyter-widgets/controls",
"children": [
"IPY_MODEL_4b1874aa6b24472888444fb333c49d6b",
"IPY_MODEL_2fa2f4ba8e5c4d09909f36e2eaf28a6a"
]
}
},
"61c8852a7442462fac25403c4943ecb9": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"4b1874aa6b24472888444fb333c49d6b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"state": {
"_view_name": "ProgressView",
"style": "IPY_MODEL_08e0f274cbda4bec939303e38442901c",
"_dom_classes": [],
"description": "Downloading: 100%",
"_model_name": "FloatProgressModel",
"bar_style": "success",
"max": 10388738808,
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": 10388738808,
"_view_count": null,
"_view_module_version": "1.5.0",
"orientation": "horizontal",
"min": 0,
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_91eaa3d93d6440fc86895ec8b74ceff1"
}
},
"2fa2f4ba8e5c4d09909f36e2eaf28a6a": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"state": {
"_view_name": "HTMLView",
"style": "IPY_MODEL_7946cd2135ab4ab885d78d386d2ac90b",
"_dom_classes": [],
"description": "",
"_model_name": "HTMLModel",
"placeholder": "​",
"_view_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"value": " 10.4G/10.4G [05:03<00:00, 34.2MB/s]",
"_view_count": null,
"_view_module_version": "1.5.0",
"description_tooltip": null,
"_model_module": "@jupyter-widgets/controls",
"layout": "IPY_MODEL_faab398bff2b44e988f29c7c574092c8"
}
},
"08e0f274cbda4bec939303e38442901c": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "ProgressStyleModel",
"description_width": "initial",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"bar_color": null,
"_model_module": "@jupyter-widgets/controls"
}
},
"91eaa3d93d6440fc86895ec8b74ceff1": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
},
"7946cd2135ab4ab885d78d386d2ac90b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"state": {
"_view_name": "StyleView",
"_model_name": "DescriptionStyleModel",
"description_width": "",
"_view_module": "@jupyter-widgets/base",
"_model_module_version": "1.5.0",
"_view_count": null,
"_view_module_version": "1.2.0",
"_model_module": "@jupyter-widgets/controls"
}
},
"faab398bff2b44e988f29c7c574092c8": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"state": {
"_view_name": "LayoutView",
"grid_template_rows": null,
"right": null,
"justify_content": null,
"_view_module": "@jupyter-widgets/base",
"overflow": null,
"_model_module_version": "1.2.0",
"_view_count": null,
"flex_flow": null,
"width": null,
"min_width": null,
"border": null,
"align_items": null,
"bottom": null,
"_model_module": "@jupyter-widgets/base",
"top": null,
"grid_column": null,
"overflow_y": null,
"overflow_x": null,
"grid_auto_flow": null,
"grid_area": null,
"grid_template_columns": null,
"flex": null,
"_model_name": "LayoutModel",
"justify_items": null,
"grid_row": null,
"max_height": null,
"align_content": null,
"visibility": null,
"align_self": null,
"height": null,
"min_height": null,
"padding": null,
"grid_auto_rows": null,
"grid_gap": null,
"max_width": null,
"order": null,
"_view_module_version": "1.2.0",
"grid_template_areas": null,
"object_position": null,
"object_fit": null,
"grid_auto_columns": null,
"margin": null,
"display": null,
"left": null
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/neoyipeng2018/7a1d601f57d74714bfdcdfb8174e458d/generatenews.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "oNLR2lAtxXiq",
"outputId": "8d29578b-bd47-4434-db5e-8cfdb82028b9"
},
"source": [
"!nvidia-smi"
],
"execution_count": 32,
"outputs": [
{
"output_type": "stream",
"text": [
"Sun Jul 11 07:39:17 2021 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 470.42.01 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 71C P0 29W / 70W | 1418MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "uahXc3R_xMh8",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "269c4cbf-9f08-423c-8581-782cb9ab06f9"
},
"source": [
"!pip install transformers\n",
"!pip install jieba\n",
"!pip install sentencepiece"
],
"execution_count": 33,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: transformers in /usr/local/lib/python3.7/dist-packages (4.8.2)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from transformers) (2.23.0)\n",
"Requirement already satisfied: huggingface-hub==0.0.12 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.12)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from transformers) (3.13)\n",
"Requirement already satisfied: sacremoses in /usr/local/lib/python3.7/dist-packages (from transformers) (0.0.45)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.7/dist-packages (from transformers) (4.41.1)\n",
"Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from transformers) (20.9)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.7/dist-packages (from transformers) (3.0.12)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (2019.12.20)\n",
"Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from transformers) (4.6.0)\n",
"Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /usr/local/lib/python3.7/dist-packages (from transformers) (0.10.3)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from transformers) (1.19.5)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2021.5.30)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (1.24.3)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (2.10)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->transformers) (3.0.4)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from huggingface-hub==0.0.12->transformers) (3.7.4.3)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.0.1)\n",
"Requirement already satisfied: click in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (7.1.2)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from sacremoses->transformers) (1.15.0)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging->transformers) (2.4.7)\n",
"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.1)\n",
"Requirement already satisfied: jieba in /usr/local/lib/python3.7/dist-packages (0.42.1)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.7/dist-packages (0.1.96)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "VK1eE59CLMMX",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "01ce8094-d2e4-4f4f-df3a-73213265150e"
},
"source": [
"# from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline\n",
"# tokenizer = AutoTokenizer.from_pretrained(\"hfl/chinese-xlnet-base\")\n",
"# model = AutoModelForCausalLM.from_pretrained(\"hfl/chinese-xlnet-base\")\n",
"# text_generator = pipeline('text-generation',model=model,tokenizer=tokenizer,device=0)\n",
"\n",
"from transformers import XLNetTokenizer, TFGPT2LMHeadModel\n",
"from transformers import TextGenerationPipeline\n",
"import jieba\n",
"# Add special processing: segment text with jieba and map spaces/newlines to placeholder characters\n",
"class XLNetTokenizer(XLNetTokenizer):\n",
" translator = str.maketrans(\" \\n\", \"\\u2582\\u2583\")\n",
" def _tokenize(self, text, *args, **kwargs):\n",
" text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]\n",
" text = \" \".join(text)\n",
" return super()._tokenize(text, *args, **kwargs)\n",
" def _decode(self, *args, **kwargs):\n",
" text = super()._decode(*args, **kwargs)\n",
" text = text.replace(' ', '').replace('\\u2582', ' ').replace('\\u2583', '\\n')\n",
" return text\n",
"\n",
"tokenizer = XLNetTokenizer.from_pretrained('mymusise/CPM-Generate-distill')\n",
"model = TFGPT2LMHeadModel.from_pretrained(\"mymusise/CPM-Generate-distill\")\n",
"\n",
"text_generator = TextGenerationPipeline(model, tokenizer,device=0)"
],
"execution_count": 34,
"outputs": [
{
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
"All model checkpoint layers were used when initializing TFGPT2LMHeadModel.\n",
"\n",
"All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at mymusise/CPM-Generate-distill.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RBJchlIVLkGP"
},
"source": [
"# ents=['格力电器', '广汽集团', '农业银行', '工商银行', '建设银行', '中国银行', '五粮液', '中国石化',\n",
"# '中国石油', '中国石油股份', '中国海洋石油', '中石油', '中国石油化工股份', '中石化', '山西证券',\n",
"# '中信证券', '兴业证券', '华东医药', '可视化', '安信信托', '英伟达', '贵州茅台',\n",
"# '山西汾酒', '华电国际', '国投电力', '长源电力', '煤炭', '卫宁健康', '东华软件', '平安好医生',\n",
"# '宝塔实业', '润邦股份', '浙江广厦', '中国海防', 'ST中天', '兖州煤业', '淡水河谷', '必和必拓',\n",
"# '力拓', '奥维云网', '美的集团', '亿纬锂能', '国电南瑞', '申万宏源', '申万宏源香港',\n",
"# '上港集团', '恒力石化', '海尔智家', '云铝股份','德展健康']\n",
"# texts=[]\n",
"# for e in ents:\n",
"# texts.append(\"\"\"任务:写含有否定的正面句子。\n",
"# 第1句:“有传言公司的收入损失,但这不是真的。”\n",
"# 第2句:“谣传公司的盈利不好,不过事实并非如此。”\n",
"# 第3句:“道听途说净利润同比减41.92%,但它是假的。”\n",
"# 第4句:“传言公司收到深交所监管函,可是这是假的。”\n",
"# 第5句:“\"\"\")\n",
"\n",
"# for _ in range(n): #i'm creating positive sentiment sentences that contain negation\n",
"# texts.append(\"\"\"任务:写五个包含有否定的正面句。\n",
"# 第1句:“谣传公司净利润同比减41.92%,但这不是真的。”\n",
"# 第2句:“公司上诉并赢得不当解雇诉讼。”\n",
"# 第3句:“凭借出色的表现,公司证明分析师是错误的。”\n",
"# 第4句:“传言公司收到深交所监管函,可是这是假的。”\n",
"# 第5句:“\"\"\")\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1bZniVM1yHwF",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "9840dab8-f0cd-458c-e099-684835021372"
},
"source": [
"n=500\n",
"\n",
"texts=[]\n",
"for _ in range(n): # Create positive-sentiment sentences that contain negation or a negative event the company successfully overcame. The example sentences are taken from real news content that contains negation.\n",
" texts.append(\"\"\"任务:写出相似情绪的正面句子。\n",
" 第1句:“海外疫情则始终得不到有效控制,在这么不利的外部环境下,公司捞取得现在的业绩,实属不易。”\n",
" 第2句:“公司成功推翻了法院的裁决,该裁决要求其退款给客户。”\n",
" 第3句:“\"\"\")\n",
"\n",
"headlines=[]\n",
"for text in texts:\n",
" token_len=len(tokenizer.tokenize(text))\n",
" # Nucleus (top-p) sampling:\n",
" # top_p=0.9, top_k=5 are from Schick and Schütze (2021) on generating datasets with pretrained language models. top_k is dropped here for more diversity. https://arxiv.org/abs/2104.07540\n",
" hl=text_generator(text,max_length=token_len+40,top_p=0.9,use_cache=True,do_sample=True, return_text=True,return_full_text=False)[0]['generated_text'] \n",
" hl=hl.split('”')[0]\n",
" if len(set(hl))>10 and '公司' in hl: # Simple noise filter: keep only outputs with more than 10 unique characters that also contain '公司' (company), since all example sentences mention the company.\n",
" print(hl)\n",
" headlines.append(hl)\n",
" # print(\"----------------------------------\\n\")\n",
"\n",
"headlineNEW=[x.split('\\n')[0] for x in headlines]\n",
"import pandas as pd\n",
"import re\n",
"df = pd.DataFrame(headlineNEW, columns=[\"headlineNEW\"])\n",
"# df['headlineNEW']=df['headlineNEW'].str.split('”',expand=True).iloc[:,0]\n",
"df['EngText']=df.headlineNEW.apply(lambda x: re.search('[a-zA-Z]', x)) # Simple noise filter: flag text that contains English letters so it can be removed below.\n",
"df=df[df.EngText.isnull()]\n",
"df.to_csv('headlineNEW.csv', index=False)\n",
"print('DONE')\n",
"df['headlineNEW'].head(50)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"公司是一个充满风险的公司。\n",
"公司的员工在没有任何证据的情况下被拘留,这是无法避免的。\n",
"公司董事长将成为美国最受欢迎的CEO,而非中国的CEO。\n",
"公司—— 被宣布为‘ 国家的‘ 公司’。\n",
"公司从财务部辞职,辞职后的三个月,没有进行任何管理,反而开始了一些对公司不利的工作。\n",
"公司的核心价值之一,正是公司本身。\n",
"公司有责任用自己的行动去纠正错误。\n",
"公司与顾客之间的利益在逐渐破裂,公司需要重新审视自己的行为。\n",
"公司在这方面做得很成功。\n",
"董事会和公司决策层一致通过了新的法案,公司要用自己的企业去拯救这些可怜人们。\n",
"因为我要在市场上争取到更多的话语权,公司应该向我表达承诺。\n",
"在这里,公司必须为客户付出,并且为社会做出贡献。\n",
"公司取得的成绩都已经不重要了。\n",
"员工也许无法忍受,但不想离开,但更希望与政府沟通,让政府为公司考虑。\n",
"然而当时公司已经陷入僵局,身为上市公司的股东,我认为现在的问题是应该如何处理,而这种僵局\n",
"公司为了保持员工安全,通过了安保措施,为此赢得了很多的公众的关注。\n",
"公司在董事长的办公室里发表了声明,表示公司是有责任的。\n",
"公司做出了一系列错误决定,损害公司利益,公司的前途就要受到影响。\n",
"公司将在未来某一天,会从另一个公司的账户中收回这些证据。\n",
"公司在海外经营着许多项目。公司在全球范围内投资了超过3000万美元,但鉴于种种原因,公司现在无法再获得应有的收益了。\n",
"公司成功推翻了某公司的股东大会。\n",
"在这么不利的外部环境下,公司捞取得现在的业绩,实属不易。\n",
"公司仍然保持着原有的经营模式。\n",
"如果企业的主要责任落到了股东,那么该公司在这场危机之前,已经是‘受害者’了。\n",
"倘若公司能够做出一个正确的决定,业界的领导者会更加相信他。\n",
"公司通过积极开展‘走出非洲’计划,让非洲人民都可以走出非洲。\n",
"公司应该让媒体对公司产生负面影响。\n",
"现在就可以,这里的法律是由公司所建立的,所以公司将可以做到一个合法的社会。\n",
"公司应该把最有价值的东西交给最重要的人。\n",
"公司员工的行为,已造成了社会的恶劣影响。\n",
"公司在这个过程中取得的辉煌成就,是我们为之自豪的。\n",
"公司应该向政府反映,政府应该提供帮助。\n",
"公司现在的核心竞争力是人,而不是机器,因此公司的生存和发展并没有丝毫的悬念。\n",
"有人把本公司的一个文件给你看,那是由公司的一个股东签署的。\n",
"公司仍在坚持,并未停止推翻判决。\n",
"公司在最短时间内完成了起诉。\n",
"公司的战略性投资获得了良好的效果,对公司造成了巨大的损害,为公司作出了重要的贡献。\n",
"为了让您真正了解公司情况,我们对他进行了多次的电话回访。\n",
"这位女士对于公司的表现表示担忧,如果公司仍然继续发展下去,她会一直承受不利的环境,为自己的未来埋下隐患\n",
"公司的员工要做的是立刻把公司的财产归还给人家。\n",
"因为这家公司一直为某些政治目的而工作。\n",
"我们公司的业务是通过纳斯达克的上市交易来实现。\n",
"但是公司没有得到解雇。\n",
"我们在处理所有问题时不考虑对公司不利的因素。\n",
"一个没有任何法律法规支持的公司,必然会遇到许多问题,但我会通过本书来告诉你如何在关键时刻做正确的事。\n",
"公司成功接管了一家外国公司,公司成了全球第一个有经济实力的公司。\n",
"‘中国企业海外部经理’、‘中国国际航空公司的总经理’、‘世界之最’、‘中国之星’的CEO\n",
"我的客户和公司利益在美国界是很大的,如果美国政府不批准在这里工作,那么将很可能受到伤害。\n",
"在过去的岁月里,“利欲熏心的公司在公众心中已经是个罪犯。\n",
"公司在这场大战中,在这方面做得很成功。\n",
"公司是在这种恶劣的环境中生存下来的。\n",
"终于有个人知道公司为何会失败了。\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Ai-modJ4IvUm"
},
"source": [
"Try the distilled CPM-Generate model. Not expecting much, but it's the best."
]
},
{
"cell_type": "code",
"metadata": {
"id": "PtD8VLCtIsPe"
},
"source": [
"from transformers import XLNetTokenizer, TFGPT2LMHeadModel\n",
"from transformers import TextGenerationPipeline\n",
"import jieba\n",
"# Add special processing: segment with jieba and map spaces/newlines to placeholder characters\n",
"class XLNetTokenizer(XLNetTokenizer):\n",
" translator = str.maketrans(\" \\n\", \"\\u2582\\u2583\")\n",
" def _tokenize(self, text, *args, **kwargs):\n",
" text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]\n",
" text = \" \".join(text)\n",
" return super()._tokenize(text, *args, **kwargs)\n",
" def _decode(self, *args, **kwargs):\n",
" text = super()._decode(*args, **kwargs)\n",
" text = text.replace(' ', '').replace('\\u2582', ' ').replace('\\u2583', '\\n')\n",
" return text\n",
"\n",
"tokenizer = XLNetTokenizer.from_pretrained('mymusise/CPM-Generate-distill')\n",
"model = TFGPT2LMHeadModel.from_pretrained(\"mymusise/CPM-Generate-distill\")\n",
"\n",
"text_generator = TextGenerationPipeline(model, tokenizer,device=0)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ANgHmqeiIsMh",
"outputId": "edf5e1e2-38c9-4782-ab29-d58f33458960"
},
"source": [
"text_generator('任务:写出5个有相似情绪的句子。第1句:“', max_length=50, do_sample=True, top_p=0.9,top_k=5)"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'generated_text': '任务:写出5个有相似情绪的句子。第1句:“我不喜欢你,” 第2句:“我不喜欢你,”'}]"
]
},
"metadata": {
"tags": []
},
"execution_count": 21
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "QvtrpzFWIsI9",
"outputId": "de3899b4-79b7-4ced-a989-df8ef7da9ed4"
},
"source": [
"text_generator(\"天下熙熙,\", max_length=15, top_k=1, use_cache=True, prefix='')"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[{'generated_text': '天下熙熙,皆为利禄。 '}]"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "za1slvrLIqFU"
},
"source": [
"The full CPM-Generate model (~10 GB download) is too big."
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 83,
"referenced_widgets": [
"6567677206674378b6fc730cad81ab37",
"f3bbb128cf9547229ade5a6bad8dd824",
"619fc2f268c840eda52f1c5720ea383b",
"d57bd6ce7c21425c96f236acfc0c06f5",
"56f32b2190704f8087f416234b6f13b7",
"2c6cb3481d1240c9a2842c48afe423f9",
"8c0c021283af40d9a6514425796e450a",
"7106cb27acf24f0da05edeffb395cb9e"
]
},
"id": "yvMkZXnGFaN5",
"outputId": "78a89c9a-28de-4af1-bcd7-f2b3fe52a4b3"
},
"source": [
"from transformers import TextGenerationPipeline, AutoTokenizer, AutoModelForCausalLM\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"TsinghuaAI/CPM-Generate\")\n",
"model = AutoModelForCausalLM.from_pretrained(\"TsinghuaAI/CPM-Generate\")\n",
"\n",
"text_generator = TextGenerationPipeline(model, tokenizer,device=0)\n",
"text_generator('清华大学', max_length=50, do_sample=True, top_p=0.9)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
],
"name": "stderr"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6567677206674378b6fc730cad81ab37",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10422021287.0, style=ProgressStyle(desc…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "-4fc6TxFFaHn"
},
"source": [
""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Obh2zXIOFaDY"
},
"source": [
""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "1OuGpkKODQuR"
},
"source": [
"CPM-GPT2 is not very good."
]
},
{
"cell_type": "code",
"metadata": {
"id": "PPAZJWsow0cQ",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 249,
"referenced_widgets": [
"e18747d9fe06419fbffdc21fe9821aeb",
"cd9c23449cb344e1a401b7eb1a4b4219",
"b8aad056d3614aed973c73cd696617b2",
"fca4aac93b164128a43b25fe8ca7a53e",
"01b9ad19d57a4b2abfa86001a9bfb411",
"a802fc38b087422ca4e83cf53eb13601",
"1c968bc1b5b544beab6cdb877993d36c",
"a98d174f6a564f9bb1c368617b037bb0",
"d9ba2fb5c8d540a4a72907721298e067",
"c2583447215e46848227d8a48472a917",
"c66cac0b642c49f69a66ed25b972634b",
"38945155fe594fd380474b78aa58dc90",
"2b1f1e8110394208b211a0841f7eb73c",
"98a3741919534591abafcc2460b5dd0a",
"9261b417cbf74af6989cd8bbae0694f7",
"c0d7730620a04f4abc614497b34024cd",
"8862ae9abd784646a4c5c5479c2dddeb",
"61c8852a7442462fac25403c4943ecb9",
"4b1874aa6b24472888444fb333c49d6b",
"2fa2f4ba8e5c4d09909f36e2eaf28a6a",
"08e0f274cbda4bec939303e38442901c",
"91eaa3d93d6440fc86895ec8b74ceff1",
"7946cd2135ab4ab885d78d386d2ac90b",
"faab398bff2b44e988f29c7c574092c8"
]
},
"outputId": "d5509736-04d1-4882-b56d-8c746c70bf8d"
},
"source": [
"from transformers import XLNetTokenizer, TFGPT2LMHeadModel\n",
"import jieba\n",
"from transformers import pipeline\n",
"# Add special processing\n",
"class XLNetTokenizer(XLNetTokenizer):\n",
" translator = str.maketrans(\" \\n\", \"\\u2582\\u2583\")\n",
"def _tokenize(self, text, *args, **kwargs):\n",
" text = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]\n",
" text = \" \".join(text)\n",
" return super()._tokenize(text, *args, **kwargs)\n",
"def _decode(self, *args, **kwargs):\n",
" text = super()._decode(*args, **kwargs)\n",
" text = text.replace(' ', '').replace('\\u2582', ' ').replace('\\u2583', '\\n')\n",
" return text\n",
"tokenizer = XLNetTokenizer.from_pretrained('mymusise/CPM-GPT2')\n",
"model = TFGPT2LMHeadModel.from_pretrained(\"mymusise/CPM-GPT2\")\n",
"text_generator = pipeline('text-generation',model=model,tokenizer=tokenizer,device=0)"
],
"execution_count": null,
"outputs": [
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "e18747d9fe06419fbffdc21fe9821aeb",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=713229.0, style=ProgressStyle(descripti…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d9ba2fb5c8d540a4a72907721298e067",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=801.0, style=ProgressStyle(description_…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "display_data",
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8862ae9abd784646a4c5c5479c2dddeb",
"version_minor": 0,
"version_major": 2
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, description='Downloading', max=10388738808.0, style=ProgressStyle(desc…"
]
},
"metadata": {
"tags": []
}
},
{
"output_type": "stream",
"text": [
"\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"All model checkpoint layers were used when initializing TFGPT2LMHeadModel.\n",
"\n",
"All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at mymusise/CPM-GPT2.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "-kh2A3PlxPIT"
},
"source": [
"Generate positive headlines that contain negation."
]
},
{
"cell_type": "code",
"metadata": {
"id": "9_tQew-7xOru"
},
"source": [
"texts=[]\n",
"for e in ents:\n",
" texts.append(\"\"\"\n",
" 华油钴:求关系改善,华油钴供的业绩不再降低。\\n\n",
" 目标超市的崛起:每天开设超过13家门店,目标超市的收入不再降低。\\n\n",
" \"\"\"+e)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ni6A3aoykAYp",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 249
},
"outputId": "7d4ace07-ede1-470f-8178-48da706a0f59"
},
"source": [
"headlines=[]\n",
"for text in texts:\n",
" token_len=len(tokenizer.tokenize(text))\n",
" hl=text_generator(text,max_length=token_len+40,top_p=0.9,top_k=5,use_cache=True,do_sample=True,\n",
" return_text=True,return_full_text=False)[0]['generated_text'] # top_p=0.9/top_k=5 from Schick and Schütze (2021) for generating datasets. Remove top_k for diversity. https://arxiv.org/abs/2104.07540\n",
" headlines.append(hl)\n",
" print(hl)\n",
" print(\"-------\\n\")"
],
"execution_count": null,
"outputs": [
{
"output_type": "error",
"ename": "AttributeError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-13-9813f24da3b3>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mheadlines\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtexts\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtoken_len\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_tokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m hl=text_generator(text,max_length=token_len+40,top_p=0.9,top_k=5,use_cache=True,do_sample=True,\n\u001b[1;32m 5\u001b[0m return_text=True,return_full_text=False)[0]['generated_text'] #p=0.9/k=5 form Schtick 2021 for generating datsets. Remove k for diversity.https://arxiv.org/abs/2104.07540\n",
"\u001b[0;31mAttributeError\u001b[0m: 'XLNetTokenizerFast' object has no attribute '_tokenize'"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "RKu6IyF5mGdE"
},
"source": [
"headlineNEW=[x.split('\\n')[0] for x in headlines]\n",
"import pandas as pd\n",
"df = pd.DataFrame(headlineNEW, columns=[\"headlineNEW\"])\n",
"df.to_csv('headlineNEW.csv', index=False)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "K8EZFxaMwmD2"
},
"source": [
"headlineNEW=pd.read_csv('headlineNEW.csv')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"id": "WaMKxg6cwrO1",
"outputId": "e4e82bcc-2d28-43c7-8058-aaf033dfd56a"
},
"source": [
"headlineNEW.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>headlineNEW</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>【 天风 拂晓 】 : “ 天风 ” , 中国 的 “ 风 ” , 吹拂 着 “ 格力 ”...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>中国 汽车行业 的 “ 黑马 ” : 广汇 汽车 正面新闻头条 : 广汇 汽车 的 “ 头...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>【 农业银行 】 农业银行 正面新闻头条 : 【 中国农业银行 】 公司 : ▂ 中国工商...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>【 工商银行 】 工商银行 : “ 中国 第一家 上市 银行 ” , 中国 第一家 股份制...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>【 建设银行 】 建设银行 : 积极支持 地方 政府 , 促进 地方 经济社会 发展 ▃ ...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" headlineNEW\n",
"0 【 天风 拂晓 】 : “ 天风 ” , 中国 的 “ 风 ” , 吹拂 着 “ 格力 ”...\n",
"1 中国 汽车行业 的 “ 黑马 ” : 广汇 汽车 正面新闻头条 : 广汇 汽车 的 “ 头...\n",
"2 【 农业银行 】 农业银行 正面新闻头条 : 【 中国农业银行 】 公司 : ▂ 中国工商...\n",
"3 【 工商银行 】 工商银行 : “ 中国 第一家 上市 银行 ” , 中国 第一家 股份制...\n",
"4 【 建设银行 】 建设银行 : 积极支持 地方 政府 , 促进 地方 经济社会 发展 ▃ ..."
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bg5gix33-zbI"
},
"source": [
"Create news content given a news headline (inspired by https://arxiv.org/abs/2104.07540)."
]
},
{
"cell_type": "code",
"metadata": {
"id": "MuA8tYuy-bji"
},
"source": [
"import pandas as pd\n",
"headlineNEW=pd.read_csv('PosHeadline.csv')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "h8WqI7T7-gac",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "742cf6e7-7273-4eaf-855f-b023f581f6c5"
},
"source": [
"headlineNEW=headlineNEW['headlineNEW'].tolist()\n",
"headlineNEW"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[' 【 天风 拂晓 】 : “ 天风 ” , 中国 的 “ 风 ” , 吹拂 着 “ 格力 ” 公司 :',\n",
" ' 中国 汽车行业 的 “ 黑马 ” : 广汇 汽车 正面新闻头条 : 广汇 汽车 的 “ 头 ” 字号 ▃ 【 ▂ 正面',\n",
" ' 【 农业银行 】 农业银行 正面新闻头条 : 【 中国农业银行 】 公司 : ▂ 中国工商银行 正面新闻头条 : 【 中国工商银行',\n",
" ' 【 工商银行 】 工商银行 : “ 中国 第一家 上市 银行 ” , 中国 第一家 股份制 银行 , 中国 第一家 “ 银行 + 保险 ” 的',\n",
" ' 【 建设银行 】 建设银行 : 积极支持 地方 政府 , 促进 地方 经济社会 发展 ▃ 诗词 : 忆 昔同 舟共泛 , 今',\n",
" ' 中国银行 的 “ 头版 ” : “ 中国银行 ” 的 新 形象 , “ 中国银行 ” 的 新气象 公司 : 中国建设银行 正面',\n",
" ' 五粮液 : 中国 酒业 的 “ 金三角 ” 公司 : ▂ ▃ ▂ 公司简介 ▂ 公司 名称 : ▂ 华油 股份 有限公司',\n",
" ' 中国 石油 正面新闻头条 : 中国 石化 正面新闻头条 : 中国 海洋 石油 正面新闻头条 : 中国 航空 油料 正面新闻头条 : 中国 中化 正面新闻',\n",
" ' 中国 石油勘探 开发 正面 报道 , “ 中国 石油 ” 品牌价值 达 3.98 亿美元 公司 : 中海油 正面新闻头条 : 中海油 “ 中',\n",
" ' 中国 石油 : 油价 下跌 , 中国石油集团公司 股价 大幅 下跌 公司 : 中化 化肥 正面新闻头条 : 中化 化肥 : 化肥 出口量 大幅 下降 公司',\n",
" ' 中国 石油 公司 正面新闻 : 中国 海洋 石油 公司 正面新闻 : 中国 海洋 石油 公司 正面 新闻 : 中国海洋石油公司 正面 新闻 : 中国海洋石油总公司',\n",
" ' “ 中石油 ” 的 新面貌 : 跻身 世界 石油 公司 行列 公司 : 道 达尔 正面新闻头条 : 道 达尔 , 世界 第二 大',\n",
" ' 中石化 的 “ 三高 ” : 高 增长 、 高 利润 、 高 技术 公司 : 中国 石油 天然气 股份 正面新闻头条 : 中国 石油 天然气 股份公司',\n",
" ' 【 中化 石油 】 中化 石油 , 中国 石化 的 新起点 公司 : 施耐德 电气 正面新闻头条 : 【 施耐德 电气',\n",
" ' 《 证券时报 》 : 中国 股市 : “ 中国 的 拉斯维加斯 ” ▃ 诗词 : 忆 昨 ,',\n",
" ' 中信证券 股份 有限公司 的 成立 : “ 中信 ” 品牌 “ 出征 ” 华尔街 , “ 中信 ” 品牌 在 美国 上市公司 : 力拓',\n",
" ' 【 “ 两会 ” : 中国 经济 的 风向标 公司 : 中国 石化 正面新闻头条 : 中国石化集团 正面新闻头条 : “ 力',\n",
" ' 【 中国 制药 工业协会 】 “ 中国 制药 工业协会 ” 成立 于 1989 年 , 是 我国 医药行业 唯一 的 国家一级 行业协会 。 协会 成立 以来',\n",
" ' 《 中国 国家 地理 》 杂志 : 中国 人 的 衣食住行 公司 : 华龙 一号 正面新闻头条 : 《 中国 国家 地理 》 杂志 : 中国 人',\n",
" ' “ 安信 ” 在 中国 的 业务 节节 攀升 , 业绩 节节 攀升 公司 : 惠而浦 正面新闻头条 : “ 惠',\n",
" ' “ 业界 良心 ” 公司业绩 节节 攀升 , 股价 却 在 下滑 公司业绩 节节 攀升 , 股价 却 在 下滑 公司业',\n",
" ' “ 贵州 茅台 ” 的 崛起 : 每瓶 酒 的 利润 超过 10 万元 人民币 公司 : 上海大众 正面新闻头条 : 中国 汽车 市场 的 “',\n",
" ' 【 山西汾酒 】 山西汾酒 的 成功之路 公司 : 【 中国 人寿 】 中国 人寿保险 公司 的 新 形象 : “ 中国 人寿保险 股份 有限公司',\n",
" ' 中国 的 “ 耐克 ” 公司 : 中国 的 “ 麦当劳 ” 公司 : 中国 的 “ 家乐福 ” 公司 : 中国 的 “ 李宁 ” 公司',\n",
" ' 国投 电力公司 : “ 国投 ” 品牌 在 世界 范围 内 , “ 国投 ” 品牌 在 中国 市场 的 占有率 达 50% 公司 : 中国建设',\n",
" ' 【 长源 】 的 “ 家 ” 是 怎样 炼成 的 公司 : 【 美的 】 的 “ 美 ” 字 : “ 家',\n",
" ' “ 煤老板 ” 的 春天 到来 了 公司 : “ 雀巢 ” 的 春天 来 了 公司 : “ 百事 ” 的 春天 来 了 公司 :',\n",
" ' 《 卫宁 》 杂志 创刊 , 健康 产业 迈向 新台阶 公司 : 美邦 正面新闻头条 : 美邦 , 让 国人 更 了解 自己 的',\n",
" ' 东华 软件 : “ ” 的 一声 , 中国 纺织工业 的 “ 金字招牌 ” 落 了 地 公司 : 中国 石化 正面新闻头条 :',\n",
" ' 平安保险 : 业界 领先 的 医疗保险 公司 公司 : 中国 平安保险 公司 正面新闻头条 : 中国 平安 人寿保险 股份 有限公司 正面新闻头条 : 中国 平安',\n",
" ' 【 】 公司 在 上海 开设 了 第一家 海外 零售店 , 【 】 在 中国 内地 开设 了 第一家 海外 零售店 , 【',\n",
" ' 中国 最大 的 服装 零售 公司 润邦 股份 : “ 润 ” 在 中国 润邦 股份 : 中国 最大 的 服装 零售 公司 润邦 股份 : “ 邦 ”',\n",
" ' 【 浙江广厦 】 浙江广厦 : “ 家 ” 的 感觉 真 好 公司 : ▂ “ 中国 制造 ” ▂ 的 新起点',\n",
" ' “ 中国海事 服务网 ” 开通 了 网上支付 服务 , 方便 客户 网上支付 公司 : “ 中国 — 东盟自由贸易区 ” 正式 成立',\n",
" ' ST 的 “ 头牌 ” 是 “ 头 ” , “ 王牌 ” 是 “ 王牌 ” , 头牌 、 王牌 、 王牌',\n",
" ' 兖煤 集团 的 发展 : 从无到有 , 从小到大 公司 : 利丰 集团 正面新闻头条 : 利丰 集团 的 发展 : 从无到',\n",
" ' 淡水河谷 公司 成立 于 1869 年 , 是 全球 第二 大 的 服装 公司 。 公司总部 设在 美国 , 在 全球 30 多个 国家 设有 销售 机构',\n",
" ' “ 必 和 必拓 ” — — 全球 最 大规模 的 石油 和 天然气 开采 企业 之一 , 其 产量 、 销售额 、 利润 和 市场占有率 均 位居 世界',\n",
" ' 力拓 公司 成立 ▂ 力拓 : 力拓 的 新 发展 ▂ 力拓 : 力拓 的 新 目标 ▂ 力拓 :',\n",
" ' 【 天风 证券 : “ 钴 ” 价 上升 带动 “ 钒 ” 价 】 天风 证券 : 钒 价上扬 带动',\n",
" ' 【 “ 美的 ” 】 , 中国家电 制造 第一 品牌 公司 : “ 家乐福 ” , 全球 最大 零售商 公司 , 年销售额',\n",
" ' “ 锂 ” 字头 公司 的 新面孔 : 锂业 “ 三驾 马车 ” 公司业绩 增长 , 利润 增加 , “ 三驾 马车',\n",
" ' “ 国电 南瑞 ” 的 成功之路 公司 : “ 中国 电力 投资 有限责任 公司 ” 正面新闻 : 中国 电力 投资 有限公司 的 成立 , 标志 着',\n",
" ' 中国 有色金属 工业协会 、 中国金属学会 、 中国金属学会 有色金属 行业 分会 、 中国 有色金属 学会 ▃ 、 中国金属学会 ▃ 、',\n",
" ' 【 申万宏源香港 】 申万宏源香港 的 发展 历程 : 【 申万宏源香港 】 申万宏源香港 的',\n",
" ' 中国 远洋 : “ 近水楼台先得月 ” , 中国 远洋 的 快速 发展 带动 了 上 港 的 发展 , 上港 的 发展 又 带动 了 中国 远洋',\n",
" ' 【 天风有色杨诚笑团队 】 恒力石化 : “ 有色金属 王国 ” 的 兴起 公司 : 中化 国际 正面新闻头条',\n",
" ' “ 中国 第一 ” 的 “ 海尔 ” “ 世界 第一 ” 的 “ 联想 ” “ 全球 第一 ” 的 “ 华为 ” “ 世界 一流',\n",
" ' “ 云铝 ” 成为 “ 中国 名牌 ” 公司 : 力帆 正面新闻头条 : 力帆 “ 力帆 ” 品牌 跻身',\n",
" ' 德展 : “ 中国 第一 , 世界 第三 ” 的 目标 已经 实现 , “ 健康 、 快乐 、 时尚 、 健康 ” 已 成为 中国 人 生活']"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "MO3Pmwhe-kfJ",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "8187611c-25ae-412f-da85-04f09ded41b1"
},
"source": [
"texts=[]\n",
"for e,h in zip(ents,headlineNEW):\n",
" texts.append(\"\"\"公司: 华友钴业\n",
" 正面新闻头条:【天风有色杨诚笑团队】华油钴:钴价上涨带动了业绩,供求关系改善,业绩有望回升\n",
" 正面新闻内容:事项:\"公司发布2019年半年报,实现营收91.04亿元,同增34.21%;归母净利润0.33亿元,同减97.81%,扣非归母净利润-0.41亿元,同减102.74%,其中,计提资产减值3.26亿元。\n",
" 同时,公司拟通过华友国际矿业收购华友控股香港持有的40%股权,交易金额1026.548万美元,维斯通持有玮达贝能源90%的股权,后者拟在印尼投资建设1×250MW火力发电项目,用于印尼红土镍矿的冶炼。\n",
" 钴价下跌带来吨利润下滑以及资产减值损失,业绩大幅下滑2019年上半年,MB低级钴均价17.18美元/磅,同减58.17%,环减49.16%;中国四钴均价20.44万元/吨,同减55.43%,环减41.20%;硫酸钴均价5.39万元/吨,同减59.08%,环减41.22%。\n",
" 报告期内,公司生产钴产品12,645吨金属量,同增17.79%;销售钴产品12,829吨金属量,同增37.16%,因钴价大幅上涨,吨利润下滑叠加资产减值损失3.26亿元,公司业绩大幅下滑。\" \n",
" 公司: 拉夏贝尔\n",
" 正面新闻头条: 中文版ZARA的崛起:每天开设超过13家门店,半年利润近5亿美元\n",
" 正面新闻内容: 中国版ZARA\"的一场硬仗:每天关店超13家,上半年亏损近5亿为了登陆A股,拉夏贝尔曾在6年内发起三次\"冲击\"。现在,这家成功登陆港股和A股两地市场的服装企业陷入又一场\"战役\"。\n",
" 8月28日晚间,拉夏贝尔的半年报正式公布:营收下滑9.78%至39.51亿元,净利润下滑311.2%至-4.98亿元。对于曾拥有过\"高光时刻\"的拉夏贝尔来说,这样的业绩同其已有的品牌影响力难以匹配。\n",
" 一时间,关于拉夏贝尔策略有误导致发展\"失速\"的言论四起。\"2019年作为公司第三个10年的起点,公司正在经历大规模的战略收缩。\"\n",
" 公司: \"\"\"+e+\"\"\"\n",
" 正面新闻头条:\"\"\"+h+\"\"\"\n",
" 正面新闻内容:\"\"\")\n",
"content=[]\n",
"for text in texts:\n",
" token_len=len(tokenizer._tokenize(text))\n",
" c=text_generator(text,max_length=token_len+100,top_p=0.9,use_cache=True,do_sample=True,return_text=True,return_full_text=False)[0]['generated_text'] #,temperature=0.7 # top_p=0.9/top_k=5 follows Schick and Schütze (2021) for generating datasets. Remove top_k for more diversity.\n",
" content.append(c)\n",
" print(c)\n",
"import pandas as pd\n",
"df = pd.DataFrame(content, columns=[\"content\"])\n",
"df.to_csv('content.csv', index=False)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
" “ 今年 ( 2002 年 ) 亏损 1.34 亿元 , 比 上年 同期 下降 38.82% 。 2001 年度 生产量 同比 下降 31.5% , 2002 年 上半年 产量 降幅 达到 15.8% 。 ... ... 今年 ( 2002 年 ) 上半年 格力 冰箱 压缩机 产量 达 15 亿台 , 同比 增长 34.4% ; 冰箱 产量 达 14 亿台 , 同比 增长 32.3% ... ... 公司 正在 积聚 力量\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "LeTEs8Pm-kb_"
},
"source": [
""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "NuFMd42a29rH"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment