From 8a573a4ee38bacce6e0e6979f5d9f8778154e3d7 Mon Sep 17 00:00:00 2001
From: Andreas Stephanides
Date: Tue, 8 Aug 2017 13:14:51 +0200
Subject: [PATCH] deleted classifier in main dir

---
 classifier.py    | 184 -----------------------------------------------
 classify_mail.py |  25 -------
 classify_text.py |  42 -----------
 3 files changed, 251 deletions(-)
 delete mode 100644 classifier.py
 delete mode 100644 classify_mail.py
 delete mode 100644 classify_text.py

diff --git a/classifier.py b/classifier.py
deleted file mode 100644
index 6abf414..0000000
--- a/classifier.py
+++ /dev/null
@@ -1,184 +0,0 @@
-from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.feature_extraction import DictVectorizer
-from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
-from sklearn.preprocessing import LabelEncoder
-from sklearn.pipeline import Pipeline, FeatureUnion
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.neural_network import MLPClassifier
-from sklearn.model_selection import train_test_split
-import numpy as np
-import yaml
-from storage import MailThread,db_session
-
-with open("data.yml", 'r') as stream:
-    try:
-        train=yaml.load(stream)
-    except yaml.YAMLError as exc:
-        print(exc)
-
-data_types= { "answered": bool, "maintopic": str}
-
-def store_training_data(i, d,key=b"answered"):
-    global train
-    if not data_types.has_key(key):
-        raise ValueError("Key "+str(key)+" unknown")
-    if not train.has_key(i):
-        train[i]={}
-    if not key is None and type(train[i]) is dict:
-        if not type(d) is data_types[key]:
-            # print str(type(d)) + " vs " + str(data_types[key])
-            raise TypeError("Data - %s - for key "% d +str(key)+" must be " +str(data_types[key])+ " but it is "+ str(type(d)))
-        train[i][key]=d
-
-
-    with open("data.yml","w") as file:
-        file.write(yaml.dump(train,default_flow_style=True))
-        file.close()
-
-
-# Lade Trainingsdaten fuer einen angegebenen key (Label/Eigenschaft)
-def get_training_threads(key="answered"):
-    t_a=[]
-    d_a=[]
-    d_a2=[]
-    for i in train:
-        t=db_session.query(MailThread).filter(MailThread.firstmail==i).first()
-        if not t is None: # Thread muss in der Datenbank sein
-            if train[i].has_key(key): # In den Trainingsdaten muss der relevante Key sein
-                t_a.append(t)
-                d_a.append(train[i][key])
-    le=LabelEncoder()
-    d_a2=le.fit_transform(d_a)
-    return (t_a,d_a2,le)
-
-
-def in_training(i, key="answered"):
-    return train.has_key(i) and train[i].has_key(key)
-
-
-def print_answers(l):
-    cc=l.classes_
-    c_id=l.transform(cc)
-    for i,c in enumerate(cc):
-        print str(i) + ": " + str(c)
-    return None
-
-
-class ThreadDictExtractor(BaseEstimator, TransformerMixin):
-    def fit(self, x, y=None):
-        return self
-    def transform(self, X,y=None):
-        return [t.mail_flat_dict() for t in X]
-
-class ThreadSubjectExtractor(BaseEstimator, TransformerMixin):
-    def fit(self, x, y=None):
-        return self
-    def transform(self, X,y=None):
-        return [t.subject() for t in X]
-
-class ThreadTextExtractor(BaseEstimator, TransformerMixin):
-    def fit(self, x, y=None):
-        return self
-    def transform(self, X,y=None):
-        return [t.text() for t in X]
-
-
-
-
-
-def build_pipe(p=b"pipe1"):
-
-    if p == "pipe1":
-        p=Pipeline([('tde', ThreadDictExtractor()),
-                    ('dv',DictVectorizer()),
-                    ('clf', MultinomialNB())
-                    ])
-    elif p=="pipe2":
-        p = Pipeline([
-            ('union', FeatureUnion(transformer_list=[
-                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
-                                      ('cv',CountVectorizer()),
-                                      ('tfidf', TfidfTransformer())
-                                      ])),
-                ('text', Pipeline([('tte',ThreadTextExtractor()),
-                                   ('cv',CountVectorizer()),
-                                   ('tfidf', TfidfTransformer())
-                                   ])),
-                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
-                                       ('dv',DictVectorizer())
-                                       ]))
-            ], transformer_weights={
-                'subject': 1,
-                'text': 0.7,
-                'envelope': 0.7
-            } )),
-            ('clf', MultinomialNB())
-        ])
-    elif p=="pipe2b":
-        p = Pipeline([
-            ('union', FeatureUnion(transformer_list=[
-                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
-                                      ('cv',CountVectorizer()),
-                                      ('tfidf', TfidfTransformer())
-                                      ])),
-                ('text', Pipeline([('tte',ThreadTextExtractor()),
-                                   ('cv',CountVectorizer()),
-                                   ('tfidf', TfidfTransformer())
-                                   ])),
-                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
-                                       ('dv',DictVectorizer())
-                                       ]))
-            ], transformer_weights={
-                'subject': 1,
-                'text': 0.7,
-                'envelope': 0.7
-            } )),
-            ('mlc', MLPClassifier())
-        ])
-    elif p=="pipe2c":
-        p = Pipeline([
-            ('union', FeatureUnion(transformer_list=[
-                ('subject', Pipeline([('tse', ThreadSubjectExtractor()),
-                                      ('cv',CountVectorizer()),
-                                      ('tfidf', TfidfTransformer())
-                                      ])),
-                ('text', Pipeline([('tte',ThreadTextExtractor()),
-                                   ('cv',CountVectorizer()),
-                                   ('tfidf', TfidfTransformer())
-                                   ])),
-                ('envelope', Pipeline([('tde', ThreadDictExtractor()),
-                                       ('dv',DictVectorizer())
-                                       ]))
-            ], transformer_weights={
-                'subject': 1,
-                'text': 1,
-                'envelope': 0.4
-            } )),
-            ('mlc', MLPClassifier())
-        ])
-    else:
-        raise ValueError("The pipe %s is not a valid pipe")
-    return p
-
-def get_pipe(p=b"pipe1",k=b"answered"):
-    p=build_pipe(p)
-    tt= get_training_threads(k)
-    p.fit(tt[0],tt[1])
-    return p,tt[2]
-
-from sklearn.metrics import accuracy_score
-
-def test_pipe(pp,k):
-    tt= get_training_threads(k)
-    X_train,X_test,y_train,y_test=train_test_split(tt[0],tt[1],test_size=0.2)
-    if type(pp) is list:
-        for p in pp:
-            print "pipe: %s" % p
-            p=build_pipe(p)
-            p.fit(X_train,y_train)
-            ypred=p.predict(X_test)
-            print accuracy_score(y_test,ypred)
-
-#pipe1=get_pipe("pipe1", "answered")
-#pipe2=get_pipe("pipe2", "maintopic")
-#pipe2b=get_pipe("pipe2b", "maintopic")
diff --git a/classify_mail.py b/classify_mail.py
deleted file mode 100644
index 00a87a7..0000000
--- a/classify_mail.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
-from sklearn.feature_extraction import DictVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import Pipeline, FeatureUnion
-import sys
-import yaml
-from sklearn.preprocessing import OneHotEncoder
-from sklearn.preprocessing import LabelEncoder
-
-
-text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
-
-text_ohc = Pipeline([('ohc', OneHotEncoder()),('clf', MultinomialNB())])
-
-combined_features = FeatureUnion([('vect1', CountVectorizer()),('vect2', CountVectorizer())])
-
-
-enc=OneHotEncoder()
-with open("example_1.yaml", 'r') as stream:
-    try:
-        train=yaml.safe_load(stream)
-    except yaml.YAMLError as exc:
-        print(exc)
-
-tc=text_clf.fit(train["data"],train["target"])
diff --git a/classify_text.py b/classify_text.py
deleted file mode 100644
index f95aac4..0000000
--- a/classify_text.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.pipeline import Pipeline
-text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB())])
-import sys
-import yaml
-
-
-
-with open("example_1.yaml", 'r') as stream:
-    try:
-        train=yaml.safe_load(stream)
-    except yaml.YAMLError as exc:
-        print(exc)
-
-tc=text_clf.fit(train["data"],train["target"])
-print(sys.argv[1])
-
-answ=(tc.predict([sys.argv[1]]))[0]
-print train["target_names"][answ]
-
-for i in range(0, (len(train["target_names"]))):
-    print (str(i)+" "+ train["target_names"][i])
-
-ca=int(raw_input("Correct answer.."))
-
-
-if ca == answ:
-    print ("Yes I got it right")
-else:
-    print("should I remember this?")
-    a=raw_input("shoudIrememberthis?")
-    if a == "y":
-        train["data"].append(sys.argv[1])
-        train["target"].append(ca)
-        print yaml.dump(train,default_flow_style=False)
-        file=open("example_1.yaml","w")
-        file.write(yaml.dump(train,default_flow_style=False))
-        file.close()
-    else:
-        print ("Ok, I already forgot")
-