From 607d71552cef9f2154b09ebece776e018ee044e7 Mon Sep 17 00:00:00 2001 From: Sushant Date: Fri, 24 Jun 2022 13:42:05 +0530 Subject: [PATCH] feat(model): Add log_reg agent --- README.md | 3 + atarashi/agents/logisticRegression.py | 89 +++++++++++++++++++++++++++ atarashi/atarashii.py | 7 ++- atarashi/build_deps.py | 13 ++-- atarashi/evaluator/evaluator.py | 6 +- pyproject.toml | 3 +- requirements.txt | 1 + setup.py | 6 +- 8 files changed, 113 insertions(+), 15 deletions(-) create mode 100644 atarashi/agents/logisticRegression.py diff --git a/README.md b/README.md index 8ed5f854..916a37e2 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,9 @@ Get the help by running `atarashi -h` or `atarashi --help` - Running **wordFrequencySimilarity** agent `atarashi -a wordFrequencySimilarity /path/to/file.c` +- Running **logisticRegression** agent + + `atarashi -a logisticRegression /path/to/file.c` - Running **tfidf** agent - With **Cosine similarity** diff --git a/atarashi/agents/logisticRegression.py b/atarashi/agents/logisticRegression.py new file mode 100644 index 00000000..1f94fe0d --- /dev/null +++ b/atarashi/agents/logisticRegression.py @@ -0,0 +1,89 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +""" +Copyright 2022 Sushant Kumar (sushantmishra02102002@gmail.com) +SPDX-License-Identifier: GPL-2.0 +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +version 2 as published by the Free Software Foundation. +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. +You should have received a copy of the GNU General Public License along +with this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+"""
+
+__author__ = 'Sushant Kumar'
+__email__ = 'sushantmishra02102002@gmail.com'
+
+import argparse
+
+from atarashi.agents.atarashiAgent import AtarashiAgent
+from atarashi.libs.initialmatch import spdx_identifer
+from logreg import logreg
+
+
+class LogisticRegression(AtarashiAgent):
+
+    def __init__(self, licenseList):
+        super().__init__(licenseList)
+
+    def predict_shortname(self, processed_comment):
+        '''
+        :param processed_comment: extracted and preprocessed comment
+        :return: Returns the logreg result for that comment (scan reads [0])
+        '''
+
+        processed_comment = [processed_comment]  # logreg gets a 1-item list
+        return logreg(processed_comment)
+
+    def scan(self, filePath):
+        '''
+        Read filePath and return the spdx_identifer matches when there are
+        any; otherwise preprocess the file and predict its short name.
+
+        :param filePath: Path of the file to scan
+        :return: Returns the list of matches collected for the file
+        '''
+
+        match = []
+
+        with open(filePath) as file:
+            raw_data = file.read()
+
+        spdx_identifers = spdx_identifer(raw_data,
+                                         self.licenseList['shortname'])
+        if spdx_identifers:
+            match.extend(spdx_identifers)
+        else:
+            processed_comment = super().loadFile(filePath)
+            license_name = self.predict_shortname(processed_comment)
+
+            match.append({
+                'shortname': str(license_name[0]),
+                'sim_score': 1.0,  # constant; not derived from the model
+                'sim_type': 'logisticRegression',
+                'description': '',
+            })
+        return match
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('processedLicenseList',
+                        help='Specify the processed license list file')
+    parser.add_argument('inputFile',
+                        help='Specify the input file which needs to be scanned'
+                        )
+
+    args = parser.parse_args()
+
+    licenseList = args.processedLicenseList
+    filename = args.inputFile
+
+    scanner = LogisticRegression(licenseList)
+    print(scanner.scan(filename))  # show the result instead of dropping it
diff --git a/atarashi/atarashii.py b/atarashi/atarashii.py
index e551ab3b..d0dbee8f 100644
--- a/atarashi/atarashii.py
+++ b/atarashi/atarashii.py
@@ -28,6 +28,7 @@
 from atarashi.agents.dameruLevenDist import
DameruLevenDist from atarashi.agents.tfidf import TFIDF from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity +from atarashi.agents.logisticRegression import LogisticRegression __author__ = "Aman Jain" __email__ = "amanjain5221@gmail.com" @@ -46,7 +47,6 @@ def atarashii_runner(inputFile, processedLicense, agent_name, :param ngramJsonLoc: Specify N-Gram Json File location :param verbose: Specify if verbose mode is on or not (Default is Off/ None) :return: Returns the array of JSON with scan results - +------------+-----------------------------------------------------------+ | shortname | Short name of the license | +------------+-----------------------------------------------------------+ @@ -78,6 +78,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim", scanner = WordFrequencySimilarity(processedLicense) elif agent_name == "DLD": scanner = DameruLevenDist(processedLicense) + elif agent_name == "logisticRegression": + scanner = LogisticRegression(processedLicense) elif agent_name == "tfidf": scanner = TFIDF(processedLicense) if similarity == "CosineSim": @@ -128,7 +130,8 @@ def main(): parser.add_argument("-l", "--processedLicenseList", required=False, help="Specify the location of processed license list file") parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], + choices=['wordFrequencySimilarity', 'DLD', + 'tfidf', 'Ngram', 'logisticRegression'], help="Name of the agent that needs to be run") parser.add_argument("-s", "--similarity", required=False, default="CosineSim", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], diff --git a/atarashi/build_deps.py b/atarashi/build_deps.py index 170c165c..d2a1a484 100755 --- a/atarashi/build_deps.py +++ b/atarashi/build_deps.py @@ -19,6 +19,10 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
""" +from atarashi.license.license_merger import license_merger +from atarashi.license.licensePreprocessor import LicensePreprocessor +from atarashi.license.licenseDownloader import LicenseDownloader +from atarashi.libs.ngram import createNgrams __author__ = "Gaurav Mishra" __email__ = "gmishx@gmail.com" @@ -27,11 +31,6 @@ import sys sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)) + '/../') -from atarashi.libs.ngram import createNgrams -from atarashi.license.licenseDownloader import LicenseDownloader -from atarashi.license.licensePreprocessor import LicensePreprocessor -from atarashi.license.license_merger import license_merger - """ Creates required files for Atarashi. @@ -40,7 +39,8 @@ The merged CSV is then processesed which is then used to create the Ngrams. """ -def download_dependencies(threads = os.cpu_count(), verbose = 0): + +def download_dependencies(threads=os.cpu_count(), verbose=0): currentDir = os.path.dirname(os.path.abspath(__file__)) licenseListCsv = currentDir + "/data/licenses/licenseList.csv" processedLicenseListCsv = currentDir + "/data/licenses/processedLicenses.csv" @@ -58,6 +58,7 @@ def download_dependencies(threads = os.cpu_count(), verbose = 0): print("** Generating Ngrams **") createNgrams(processedLicenseListCsv, ngramJsonLoc, threads, verbose) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-t", "--threads", required = False, default = os.cpu_count(), diff --git a/atarashi/evaluator/evaluator.py b/atarashi/evaluator/evaluator.py index 64ecce66..3a28045d 100755 --- a/atarashi/evaluator/evaluator.py +++ b/atarashi/evaluator/evaluator.py @@ -56,7 +56,6 @@ def processFile(scan_input): ''' processFile function runs the agent command on the bash/terminal and gets the result for the given file - :param filepath: The path of the file to be scanned :param similarity: Similarity type of the agent :return: Returns 1 if the result found by agent is correct and otherwise returns false @@ -89,7 
+88,6 @@ def evaluate(scanner): The Function runs the agent command on the bash/terminal and gets the result. The license name is then parsed from the result and matched with the actual name. Successful matched % is then returned as accuracy. - :param scanner: Scanner object prepared to run scans :return: Time elapsed in the evaluation & the accuracy :rtype: float, int @@ -118,7 +116,8 @@ def evaluate(scanner): defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json") parser = argparse.ArgumentParser() parser.add_argument("-a", "--agent_name", required=True, - choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'], + choices=['wordFrequencySimilarity', 'DLD', + 'tfidf', 'Ngram', 'logisticRegression'], help="Name of the agent that needs to be run") parser.add_argument("-s", "--similarity", required=False, default="CosineSim", choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"], @@ -156,4 +155,3 @@ def evaluate(scanner): print(' ' + '+' * 44) shutil.rmtree('TestFiles') - diff --git a/pyproject.toml b/pyproject.toml index 5160d313..8a897cf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,5 +10,6 @@ requires = [ "textdistance>=3.0.3", "pyxDamerauLevenshtein>=1.5", "nirjas>=0.0.5", - "urllib3>=1.24.1" + "urllib3>=1.24.1", + "logreg>=0.1.0" ] diff --git a/requirements.txt b/requirements.txt index d77b15dd..a056ef85 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ textdistance>=3.0.3 setuptools>=39.2.0 nirjas>=0.0.5 urllib3>=1.24.1 +logreg>=0.1.0 diff --git a/setup.py b/setup.py index d468589e..f02d02c9 100755 --- a/setup.py +++ b/setup.py @@ -55,7 +55,8 @@ def read(fname): 'tqdm>=4.42.0', 'pandas>=0.23.1', 'urllib3>=1.24.1', - 'nirjas>=0.0.5' + 'nirjas>=0.0.5', + 'logreg>=0.1.0' ] requirements = [ @@ -68,7 +69,8 @@ def read(fname): 'textdistance>=3.0.3', 'pyxDamerauLevenshtein>=1.5', 'urllib3>=1.24.1', - 'nirjas>=0.0.5' + 'nirjas>=0.0.5', + 'logreg>=0.1.0' ] class BuildAtarashiDependencies(distutils.cmd.Command):