Get from http://kin-y.github.io/miningReviewRepo/
python3 GetFileList.py gm_openstack user passwd
mkdir revision_files
python3 RequestFileDiff.py gm_openstack https://review.openstack.org start end --from-ini
Get from http://kin-y.github.io/miningReviewRepo/
python3 GetFileList.py gm_openstack user passwd
mkdir revision_files
python3 RequestFileDiff.py gm_openstack https://review.openstack.org start end --from-ini
| #!/usr/bin/env python3 | |
| """ | |
| Get file list from mysql | |
| Usage: | |
| $ python3 src/GetFileList.py gm_openstack user passwd | |
| Output: | |
| ./gm_openstack.csv | |
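| - "ch_id": Change id (t_change.ch_Id) | |
| - "ch_change_id": Change-Id value of the change (t_change.ch_changeId) | |
| - "rev_id": Revision id (t_revision.rev_Id) | |
| - "rev_change_id": Id of the change the revision belongs to (t_revision.rev_changeId) | |
| - "f_file_name": File path (t_file.f_fileName), URL-encoded with quote_plus | |
| - "rev_patchSetNum": Patch set number of the revision (t_revision.rev_patchSetNum) | |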
| """ | |
| import sys | |
| import csv | |
| from urllib.parse import quote_plus | |
| from collections import defaultdict | |
| import MySQLdb | |
def main():
    """
    Export the file list of every revision of every change to a CSV file.

    Command line: ``GetFileList.py <db> <user> <passwd>``. When the number of
    arguments is anything other than three, the defaults ``gm_openstack`` /
    ``root`` / empty password are used.

    Reads the tables ``t_change``, ``t_revision`` and ``t_file`` and writes
    ``./<db>.csv`` with one row per (change, revision, file) and the columns
    ``ch_id, ch_change_id, rev_id, rev_change_id, f_file_name,
    rev_patchSetNum``. ``f_file_name`` is URL-encoded with ``quote_plus``.
    """
    # Set arguments
    if len(sys.argv) == 4:
        current_db, user, passwd = sys.argv[1:4]
    else:
        current_db, user, passwd = "gm_openstack", "root", ""

    # Read the three tables; the connection is closed even if a query fails.
    connection = MySQLdb.connect(db=current_db, user=user, passwd=passwd)
    try:
        cursor = connection.cursor()

        sys.stdout.write("\rCollecting changes...")
        cursor.execute("SELECT id, ch_Id, ch_changeId FROM t_change")
        changes = cursor.fetchall()

        sys.stdout.write("\rCollecting revisions...")
        cursor.execute("SELECT id, rev_Id, rev_changeId, rev_patchSetNum "
                       "FROM t_revision")
        revisions = cursor.fetchall()

        sys.stdout.write("\rCollecting files...")
        cursor.execute("SELECT f_fileName, f_revisionId FROM t_file")
        files = cursor.fetchall()
    finally:
        connection.close()

    # Group revisions by rev_changeId and files by f_revisionId so that the
    # nested loops below are dictionary lookups instead of table scans.
    revisions_by_change = defaultdict(list)
    for revision in revisions:
        revisions_by_change[revision[2]].append(revision)
    files_by_revision = defaultdict(list)
    for rev_file in files:
        files_by_revision[int(rev_file[1])].append(rev_file)

    # Build the output rows: change -> its revisions -> their files
    output_files = []
    changes_len = len(changes)
    for i, change in enumerate(changes, start=1):
        # .get() avoids inserting an empty list for changes without revisions
        ch_revisions = revisions_by_change.get(change[0], ())
        ch_id = change[1]
        ch_change_id = change[2]
        revisions_len = len(ch_revisions)
        for j, revision in enumerate(ch_revisions, start=1):
            rev_id = revision[1]
            rev_change_id = revision[2]
            rev_patch_set_num = revision[3]
            output_files.extend(
                [ch_id, ch_change_id,
                 rev_id, rev_change_id,
                 quote_plus(rev_file[0]), rev_patch_set_num]
                for rev_file in files_by_revision.get(revision[0], ()))
            # Counters are 1-based so the last line reads "N / N"
            sys.stdout.write("\rChange: %d / %d, Revision: %d / %d" %
                             (i, changes_len, j, revisions_len))

    # Output
    sys.stdout.write("\rOutputting files...")
    # newline='' lets the csv module control line endings on every platform
    with open(current_db + ".csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, lineterminator='\n')
        writer.writerow(["ch_id", "ch_change_id",
                         "rev_id", "rev_change_id",
                         "f_file_name", "rev_patchSetNum"])
        writer.writerows(output_files)


if __name__ == '__main__':
    main()
| #!/usr/bin/env python3 | |
| """ | |
| Get file revised from csv | |
| """ | |
| from csv import DictReader | |
| from sys import argv, stdout | |
| from os import mkdir, path, error | |
| from time import sleep | |
| from requests import get, exceptions | |
# Usage text printed by main() on a wrong argument count or -h/--help.
# Built with implicit string concatenation so the text does not depend on
# the indentation of a continuation line. The two flags are alternatives:
# main() accepts at most one of them.
USAGE = ("Usage: python3 src/RequestFileDiff.py current_db requests_header "
         "start end [--from-ini | --from-prev]")

# Base selection modes passed to make_param_from()
FROM_BASE = 0  # no "base" request parameter is sent
FROM_INI = 1   # "base" is patch set 1
FROM_PREV = 2  # "base" is the patch set preceding the requested one
# Retry policy for one diff request
_MAX_ATTEMPTS = 4
_RETRY_WAIT_SECONDS = 30
_REQUEST_TIMEOUT_SECONDS = 60


def main():
    """
    Download the diff of each file revision listed in ``./<current_db>.csv``.

    Command line:
    ``RequestFileDiff.py current_db requests_header start end
    [--from-ini | --from-prev]``

    ``start`` and ``end`` are 1-based, inclusive data-row numbers of the CSV
    (the header row is not counted). For every selected row the URL
    ``<requests_header>/changes/<ch_id>/revisions/<rev_patchSetNum>/files/
    <f_file_name>/diff`` is requested and the response body is written to
    ``./revision_files/<current_db>/<rev_id>/<f_file_name>.json``.

    The usage text is printed and nothing is done when the argument count is
    wrong, ``-h``/``--help`` is given, or ``start``/``end`` are not integers.
    """
    from itertools import islice
    from os import makedirs

    # Work on a copy so that sys.argv itself is left untouched
    args = list(argv)
    base_mode = FROM_BASE
    if "--from-ini" in args:
        base_mode = FROM_INI
        args.remove("--from-ini")
    elif "--from-prev" in args:
        base_mode = FROM_PREV
        args.remove("--from-prev")
    if len(args) != 5 or "-h" in args or "--help" in args:
        print(USAGE)
        return

    # Set arguments
    current_db = args[1]
    requests_header = args[2]  # exp) https://review.openstack.org
    try:
        start = int(args[3])
        end = int(args[4])
    except ValueError:
        print(USAGE)
        return

    # Make project's directory (and ./revision_files when it is missing)
    projects_path = "./revision_files/" + current_db
    makedirs(projects_path, exist_ok=True)

    first = max(start, 1)
    with open(current_db + ".csv", 'r', newline='') as csvfile:
        reader = DictReader(csvfile)
        # islice skips rows 1..first-1 without consuming row `first` itself,
        # so the numbering printed below matches the CSV row being fetched.
        selected = islice(reader, first - 1, max(end, first - 1))
        for i, row in enumerate(selected, start=first):
            f_file_name = str(row["f_file_name"])
            rev_patch_set_num = str(row["rev_patchSetNum"])
            requests_url = "/".join([requests_header,
                                     "changes", str(row["ch_id"]),
                                     "revisions", rev_patch_set_num,
                                     "files", f_file_name,
                                     "diff"])
            params = make_param_from(int(rev_patch_set_num), base_mode)

            response = _fetch_diff(requests_url, params, i)
            if response is None:
                # Every attempt raised: there is nothing to save for this row
                print("\n" + str(i) + ": no response received, skipped")
                continue
            response.encoding = 'utf-8'

            # Output
            revisions_path = "/".join([projects_path, row["rev_id"]])
            if not _save_diff(revisions_path, f_file_name, response.text):
                continue
            stdout.write("\rFile: %d / %d" % (i, end))


def _fetch_diff(requests_url, params, row_number):
    """
    Request one diff, retrying on errors; return the last response or None.

    A 200 or 404 answer ends the retries immediately. Any other status or a
    requests exception is reported and retried after a pause, up to
    ``_MAX_ATTEMPTS`` attempts. ``None`` is returned only when no attempt
    produced a response at all.
    """
    response = None
    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            response = get(requests_url, params=params,
                           timeout=_REQUEST_TIMEOUT_SECONDS)
        except exceptions.RequestException as err:
            print("\n" + str(row_number) + ": " + str(err))
        else:
            if response.status_code == 200:
                break
            print("\n" + str(row_number) + ": " + requests_url + " " +
                  str(response.status_code))
            if response.status_code == 404:
                break
        # No point in waiting after the final attempt
        if attempt < _MAX_ATTEMPTS:
            sleep(_RETRY_WAIT_SECONDS)
    # NOTE(review): a non-200 response is still returned and saved by the
    # caller, as before -- confirm whether error bodies should be stored.
    return response


def _save_diff(revisions_path, f_file_name, text):
    """
    Write ``text`` to ``<revisions_path>/<f_file_name>.json``.

    Returns True on success; on an OS error prints a message and returns
    False.
    """
    from os import makedirs

    try:
        makedirs(revisions_path, exist_ok=True)
        # Explicit UTF-8 so the write does not depend on the locale encoding
        with open("/".join([revisions_path, f_file_name + ".json"]),
                  'w', encoding='utf-8') as out_file:
            out_file.write(text)
    except OSError:
        print("\nOS Error")
        return False
    return True
def make_param_from(rev_patch_set_num, base_mode):
    """
    Build the query parameters that select the diff base.

    Returns None for patch set 1, for FROM_BASE and for any unrecognised
    mode; otherwise a dict with a single "base" entry: "1" for FROM_INI, or
    the preceding patch set number (as a string) for FROM_PREV.
    """
    # The first patch set and FROM_BASE send no "base" parameter
    if base_mode == FROM_BASE or rev_patch_set_num == 1:
        return None
    if base_mode == FROM_INI:
        base = "1"
    elif base_mode == FROM_PREV:
        base = str(rev_patch_set_num - 1)
    else:
        return None
    return {"base": base}


if __name__ == '__main__':
    main()