We use `langchain` to interact with the OpenAI API and `dotenv` to manage our API key. The core code is as follows:
```python
import os
import json
import time

import tiktoken
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI
from langchain.schema import SystemMessage
from global_logger import Log

# Load the OpenAI API key from the local .env file.
load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

logger = Log.get_logger()
chat = ChatOpenAI(temperature=0, max_tokens=500)


def num_tokens_from_messages(value, model="gpt-3.5-turbo-0301"):
    """Returns the number of tokens used by a message string."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo-0301":  # note: future models may deviate from this
        num_tokens = len(encoding.encode(value))
        return num_tokens
    else:
        raise NotImplementedError(
            f"num_tokens_from_messages() is not presently implemented for model {model}. "
            "See https://github.com/openai/openai-python/blob/main/chatml.md for information "
            "on how messages are converted to tokens."
        )


def chat_gpt_answer(data_tests, output_dir):
    # data_tests maps a test-set name to a list of question prompts.
    for k in data_tests:
        data_eggs = data_tests[k]
        output_dir_path = f"{output_dir}/{k}"
        os.makedirs(output_dir_path, exist_ok=True)
        for i, question in enumerate(data_eggs):
            outputfile_answer = os.path.join(output_dir_path, f"{i}_answer.md")
            outputfile_question = os.path.join(output_dir_path, f"{i}_question.md")
            messages = [
                SystemMessage(content=question),
            ]
            try:
                response = chat(messages)
                logger.info(response)
                answers = response.content
                time.sleep(3)  # throttle requests to avoid rate limits
            except Exception as e:
                logger.info(e)
                continue
            with open(outputfile_answer, "w") as f:
                f.write(answers)
            with open(outputfile_question, "w") as f1:
                f1.write(question)
```
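For reference, a minimal sketch of how `chat_gpt_answer` might be driven; the JSON file name, its structure (a mapping from test-set names to lists of question prompts), and the output directory are illustrative assumptions, not the exact files used above.

```python
# Hypothetical driver (file name and structure are assumptions):
# questions.json maps a test-set name to a list of prompt strings, e.g.
# {"python": ["Explain list comprehensions ...", ...], "solidity": [...]}
with open("questions.json", "r") as f:
    data_tests = json.load(f)

chat_gpt_answer(data_tests, output_dir="gpt35_answers")
```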
We interact with locally deployed StarChat/CodeLlama models via Hugging Face `transformers`. The core code is as follows:
```python
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


def load_plm(model_name):
    # Load the tokenizer and model weights from the local Hugging Face cache.
    tokenizer = AutoTokenizer.from_pretrained(f'huggingface/hub/{model_name}', trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(f'huggingface/hub/{model_name}', trust_remote_code=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model.to(device)
    return tokenizer, model


tokenizer, model = load_plm('CodeLlama-13b-Instruct-hf')
# tokenizer, model = load_plm('starchat-alpha')


def chat_with_starchat(question):
    # StarChat uses the <|system|>/<|user|>/<|assistant|> dialogue template.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
    prompt_template = "<|system|>\n<|end|>\n<|user|>\n{query}<|end|>\n<|assistant|>"
    prompt = prompt_template.format(query=question)
    outputs = pipe(prompt, max_new_tokens=256, no_repeat_ngram_size=2)
    return outputs[0]["generated_text"]


def chat_with_codellama(question):
    # CodeLlama-Instruct uses the [INST] ... [/INST] instruction template.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
    prompt_template = "<s>[INST] {query} [/INST]"
    prompt = prompt_template.format(query=question)
    outputs = pipe(prompt, max_new_tokens=256, no_repeat_ngram_size=2)
    return outputs[0]["generated_text"]


def get_answer(base_path, output_file):
    # Each language directory contains one question file per task.
    language_path = ["C/code", "java-basics/code", "python/code", "solidity/code"]
    for language in language_path:
        path = os.path.join(base_path, language)
        for f in sorted(os.listdir(path)):
            fpath = os.path.join(path, f)
            lan = language.split('/')[0]
            with open(fpath, "r") as qf:
                question = qf.read()
            os.makedirs(f"{base_path}/{lan}/{f}", exist_ok=True)
            answer = chat_with_codellama(question)
            # answer = chat_with_starchat(question)
            with open(f"{base_path}/{lan}/{f}/{output_file}", "w") as out:
                out.write(answer)
```
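As a usage sketch, the harness can be invoked as below; the `base_path` layout (question files under `<base_path>/<language>/code/`) follows the code above, while the concrete directory and output file name are assumptions for illustration.

```python
# Hypothetical invocation (paths are assumptions): each answer is written to
# <base_path>/<language>/<question_file>/<output_file>.
get_answer(base_path="benchmark", output_file="codellama_answer.md")
```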