akshaykarnawat · May 14, 2024 22:50
diff --git a/huggingface_model.py b/huggingface_model.py
 import os
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

 # Optional Hugging Face access token, read from the environment, e.g.
 #   export HUGGING_FACE_API_KEY=hf_************
 # os.environ.get() yields None when the variable is unset.
 HUGGING_FACE_API_KEY = os.environ.get("HUGGING_FACE_API_KEY")

 model_id = "microsoft/Phi-3-mini-4k-instruct"

 # The token is forwarded to the Hub for both downloads; an empty string is
 # normalised to None so that no blank credential is sent.
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     legacy=False,
     token=HUGGING_FACE_API_KEY or None,
 )
 # NOTE(review): trust_remote_code=True allows Python code shipped with the
 # model repository to run locally -- keep it only for repositories you trust.
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype="auto",
     trust_remote_code=True,
     token=HUGGING_FACE_API_KEY or None,
 )

 # Bound to its own name so the imported `pipeline` factory is not shadowed
 # and stays callable.
 generator = pipeline(
     task="text-generation",
     model=model,
     tokenizer=tokenizer,
 )

 # A single-turn conversation in chat-message form.
 messages = [
     {"role": "user", "content": "What are competitors to Apache Kafka?"},
 ]

 # Sampling is disabled (do_sample=False), so no `temperature` is passed: it
 # is a sampling-only setting and has no effect on this call.
 # max_new_tokens caps the length of the generated reply.
 output = generator(
     messages,
     max_new_tokens=500,
     return_full_text=False,
     do_sample=False,
 )

 # Print the generated text of the first (and only) result.
 print(output[0].get("generated_text"))

 ############
 ## OUTPUT ##
 ############
 # Apache Kafka is a popular distributed streaming platform used for building real-time data pipelines and streaming applications. While Kafka has a strong presence in the market, there are several competitors and alternative technologies that offer similar or complementary functionalities. Here are some of the notable competitors and alternatives to Apache Kafka:
 #
 # 1. Apache Pulsar:
 #   - Pulsar is an open-source publish-subscribe messaging system that can handle millions of messages per second. It is designed to be highly scalable, fault-tolerant, and low-latency. Pulsar supports multiple messaging protocols, including Kafka, and offers a more modern and flexible API.
 #
 # 2. Apache Samza:
 #   - Samza is a stream processing framework built on top of Apache Kafka. It provides a simple and easy-to-use API for building stateful stream processing applications. Samza integrates with Kafka for messaging and provides fault-tolerance, scalability, and low-latency processing.
 #
 # 3. Apache Beam:
 #   - Apache Beam is a unified model for defining both batch and streaming data processing pipelines. It provides a set of abstractions for building data processing pipelines that can run on various execution engines, including Apache Flink, Apache Spark, and Google Cloud Dataflow. Beam supports Kafka as a source and sink for data streams.
 #
 # 4. Apache Flink:
 #   - Apache Flink is a distributed stream processing framework that provides high-throughput, low-latency, and fault-tolerant processing of streaming data. Flink supports Kafka as a source and sink for data streams and offers a rich set of APIs for building complex stream processing applications.
 #
 # 5. Apache Storm:
 #   - Apache Storm is a distributed real-time computation system that can process large volumes of streaming data. It provides a simple and flexible API for building real-time applications and integrates with Kafka for messaging.
 #
 # 6. Apache Kudu:
 #   - Kudu is a columnar storage system designed for low-latency, high-throughput analytics. It can be used as a data source and sink for streaming data and integrates with Kafka for messaging.
 #
 # 7. Apache Druid:
 #   - Druid is an open-source column
 #
diff --git a/requirements.txt b/requirements.txt
 huggingface_hub
 transformers
 torch
 accelerate
	import os
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

	# Optional Hugging Face access token, read from the environment, e.g.
	#   export HUGGING_FACE_API_KEY=hf_************
	# os.environ.get() yields None when the variable is unset.
	HUGGING_FACE_API_KEY = os.environ.get("HUGGING_FACE_API_KEY")

	model_id = "microsoft/Phi-3-mini-4k-instruct"

	# The token is forwarded to the Hub for both downloads; an empty string is
	# normalised to None so that no blank credential is sent.
	tokenizer = AutoTokenizer.from_pretrained(
	    model_id,
	    legacy=False,
	    token=HUGGING_FACE_API_KEY or None,
	)
	# NOTE(review): trust_remote_code=True allows Python code shipped with the
	# model repository to run locally -- keep it only for repositories you trust.
	model = AutoModelForCausalLM.from_pretrained(
	    model_id,
	    device_map="auto",
	    torch_dtype="auto",
	    trust_remote_code=True,
	    token=HUGGING_FACE_API_KEY or None,
	)

	# Bound to its own name so the imported `pipeline` factory is not shadowed
	# and stays callable.
	generator = pipeline(
	    task="text-generation",
	    model=model,
	    tokenizer=tokenizer,
	)

	# A single-turn conversation in chat-message form.
	messages = [
	    {"role": "user", "content": "What are competitors to Apache Kafka?"},
	]

	# Sampling is disabled (do_sample=False), so no `temperature` is passed: it
	# is a sampling-only setting and has no effect on this call.
	# max_new_tokens caps the length of the generated reply.
	output = generator(
	    messages,
	    max_new_tokens=500,
	    return_full_text=False,
	    do_sample=False,
	)

	# Print the generated text of the first (and only) result.
	print(output[0].get("generated_text"))

	############
	## OUTPUT ##
	############
	# Apache Kafka is a popular distributed streaming platform used for building real-time data pipelines and streaming applications. While Kafka has a strong presence in the market, there are several competitors and alternative technologies that offer similar or complementary functionalities. Here are some of the notable competitors and alternatives to Apache Kafka:
	#
	# 1. Apache Pulsar:
	# - Pulsar is an open-source publish-subscribe messaging system that can handle millions of messages per second. It is designed to be highly scalable, fault-tolerant, and low-latency. Pulsar supports multiple messaging protocols, including Kafka, and offers a more modern and flexible API.
	#
	# 2. Apache Samza:
	# - Samza is a stream processing framework built on top of Apache Kafka. It provides a simple and easy-to-use API for building stateful stream processing applications. Samza integrates with Kafka for messaging and provides fault-tolerance, scalability, and low-latency processing.
	#
	# 3. Apache Beam:
	# - Apache Beam is a unified model for defining both batch and streaming data processing pipelines. It provides a set of abstractions for building data processing pipelines that can run on various execution engines, including Apache Flink, Apache Spark, and Google Cloud Dataflow. Beam supports Kafka as a source and sink for data streams.
	#
	# 4. Apache Flink:
	# - Apache Flink is a distributed stream processing framework that provides high-throughput, low-latency, and fault-tolerant processing of streaming data. Flink supports Kafka as a source and sink for data streams and offers a rich set of APIs for building complex stream processing applications.
	#
	# 5. Apache Storm:
	# - Apache Storm is a distributed real-time computation system that can process large volumes of streaming data. It provides a simple and flexible API for building real-time applications and integrates with Kafka for messaging.
	#
	# 6. Apache Kudu:
	# - Kudu is a columnar storage system designed for low-latency, high-throughput analytics. It can be used as a data source and sink for streaming data and integrates with Kafka for messaging.
	#
	# 7. Apache Druid:
	# - Druid is an open-source column
	#
No results found