div. updates
__init__.py (92 changed lines)
@@ -1,40 +1,12 @@
 import os
 import sys
+import lockfile
+#from lockfile import LockFile
 package_directory = os.path.dirname(os.path.abspath(__file__))
 from config import Config

 cfg = Config(file(os.path.join(package_directory, 'config.cfg')))
 #--------------- Logging
-import logging
-
-file_handler=logging.FileHandler(cfg.logfile)
-file_handler.setLevel(logging.DEBUG)
-stream_handler=logging.StreamHandler(sys.stdout)
-stream_handler.setLevel(logging.DEBUG)
-
-clt=logging.getLogger('mylogger')
-clt.setLevel(logging.DEBUG)
-clt.addHandler(file_handler)
-clt.addHandler(stream_handler)
-
-clogger=clt
-#----------------
-download_path=cfg.download_path
-
-lg=clt
-
-from gevent import spawn, monkey
-monkey.patch_all()
-from .compiler import start_workers
-#start_workers(1,1,1)
-
-
-if cfg.bot_active:
-    from src.bot import bot
-    #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
-    bot.message_loop()
-
-


 # Framework
@@ -43,16 +15,66 @@ from flask import Flask, jsonify, render_template, redirect, request,send_from_d
 from flask_cors import CORS, cross_origin
 #Authentication
 from flask_jwt import JWT, jwt_required, current_identity
+import logging
+
+
+app = Flask(__name__)
+
+file_handler=logging.FileHandler(cfg.logfile)
+file_handler.setLevel(logging.DEBUG)
+stream_handler=logging.StreamHandler(sys.stdout)
+stream_handler.setLevel(logging.DEBUG)
+
+CORS(app)
+clt=logging.getLogger('mylogger')
+clt.setLevel(logging.DEBUG)
+clt.addHandler(file_handler)
+clt.addHandler(stream_handler)
+lg=clt
+
+#clogger=clt
+#----------------
+
+#app.config['LOGGER_NAME']='mylogger'
+app.logger.setLevel(logging.DEBUG)
+app.logger.info("Server Started")
+app.logger.setLevel(logging.DEBUG)
+app.logger.addHandler(file_handler)
+app.logger.addHandler(stream_handler)
+
+clogger=app.logger
+
+
+
+
+
+download_path=cfg.download_path
+
+
 from src.models import Article,Section
 from src.users import authenticate, identity
 from datetime import datetime

-app = Flask(__name__)
-CORS(app)
-app.config['LOGGER_NAME']='mylogger'
-app.logger.setLevel(logging.DEBUG)
-app.logger.info("Server Started")
+from gevent import spawn, monkey
+monkey.patch_all()
+from .compiler import start_workers
+#start_workers(1,1,1)


+lock = lockfile.LockFile("/srv/crawlerapi/bot.lock")
+
+try:
+    if cfg.bot_active and not lock.is_locked():
+        lock.acquire()
+        from src.bot import bot
+        #if not app.debug or os.environ.get("WERKZEUG_RUN_MAIN") == "true":
+        bot.message_loop()
+except lockfile.Error:
+    clogger.info("Couldn't Lock the bot file")
+
+
+
+
 app.config['SECRET_KEY'] = 'super-secret'
 import flask
@@ -59,6 +59,9 @@ class FullArticleSchema(Schema):
     author=fields.String(allow_none=True)
     section_id=fields.Integer()
     sourcetype =fields.String()
+    organization_name=fields.String(dump_only=True)
+    organization_image=fields.String(dump_only=True)
+    organization_id=fields.Integer(dump_only=True)
     image =fields.String(allow_none=True)
     # @post_load
     # def make_article(self, data):
@@ -95,8 +98,14 @@ class Article(Base):
         self.title=title
         self.published_date=published_date
         self.first_fetched=datetime.now()
-    def __json__(self):
-        return ArticleSchema().dump(self)[0]
+    # def __json__(self):
+    # return ArticleSchema().dump(self)[0]
+    def organization_name(self):
+        return self.section.organization.name
+    def organization_image(self):
+        return self.section.organization.image
+    def organization_id(self):
+        return self.section.organization.id

     # def dict(self):
     #     return {"id": str(int(self.id)), "title": self.title, "text": self.text, "author": self.author, "section":self.section, "sourcetype": self.sourcetype, "last_fetched": self.last_fetched, "first_fetched": self.first_fetched, "published_date": self.published_date, "date": self.date,"image": self.image, "url": self.url}
@@ -50,7 +50,7 @@ def update(id):
 @article_pages.route("/<int:id>.json",methods=['GET'])
 def get(id):
     article=Article.query.get(id)
-    clogger.info(article)
+    # clogger.info(article)
     # article=ArticleSchema().dump(article)[0]
     return jsonify(article=article)

@@ -5,7 +5,7 @@ import urlparse
 from src.fb import graph
 from facebook import GraphAPIError
 import json
-
+import gevent

 def fbfeedelement(h):
     art={}
@@ -45,4 +45,5 @@ def fscharticle(url,raw,params={}):
     d["published"]=parse(pi["published"])
     d["pi"]=pi
     d["sourcetype"]="fscharticle"
+    d["section"]= "Fachschaft Chemie"
     return {"article": d}
@@ -1,6 +1,6 @@
 from requests import session
 s=session()
-from src import package_directory, download_path,cfg
+from src import package_directory, download_path,cfg, clogger
 from os import path, makedirs
 import os
 import json
@@ -12,6 +12,7 @@ from src.database import db_session2
 from models import CrawlUrl
 import errno
 import urlparse
+from sqlalchemy.exc import OperationalError, InvalidRequestError
 def announce_articleid(id):
     for u in cfg.announcearticle_url:
         s.get( u % id)
@@ -34,23 +35,35 @@ from models import CrawlCache
 from datetime import datetime, timedelta


+def cleanup_cache():
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days*2)
+    CrawlCache.query.filter(CrawlCache.fetched<ten_weeks_ago).delete()
+
+def get_cached_page(furl):
+    current_time = datetime.utcnow()
+    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
+    try:
+        cc= CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    except OperationalError:
+        db_session2.rollback()
+        cc=None
+    return cc
+
+
 def fetch_page(furl):
-    current_time = datetime.utcnow()
-    ten_weeks_ago = current_time - timedelta(days=cfg.cache_days)
     u=urlparse.urlparse(furl)
+    current_time = datetime.utcnow()
     cu=CrawlUrl.query.filter(CrawlUrl.url==furl).first()

     if u[0] == '':
         furl=urlparse.urlunsplit(("http",u[1],u[2],u[3],u[4]))
-    cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+    cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
     if cc is None or u[0]=='fb': # no caching for Facebook
         clogger.debug("fetching url: "+ str(furl))
         if u[0]=='fb':
             fb_time_since = str(int((current_time - timedelta(days=10)-datetime(1970,1,1)).total_seconds()))
             furl=u[1]+u[2]+"?since="+fb_time_since+"&fields=story,created_time,id,message,attachments"
-            cc=CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
+            cc=get_cached_page(furl) #CrawlCache.query.filter(CrawlCache.url==furl).filter(CrawlCache.fetched>ten_weeks_ago).first()
             if cc is None:
                 tx = json.dumps(graph.get_object(id=furl))
             else:
@@ -27,19 +27,23 @@ class CrawlUrl(Base2):
     tpe=Column(String(250))
     url = Column(String(250))
     last_fetched = Column(DateTime)
+    __schema__=CrawlUrlSchema
+    __jsonid__='crawlurl'
+    __whiteattrs__=["id","tpe","url"]
+    __jsonattrs__=None
     def fetched(self):
         CrawlCache.query.find(CrawlCache.url==self.url).first()
     @classmethod
     def find_or_create(self, tpe, url):
         aa = CrawlUrl.query.filter(CrawlUrl.url==url).filter(CrawlUrl.tpe==tpe).first()
         if aa is None:
-            aa=CrawlUrl(tpe,url)
+            aa=CrawlUrl({"tpe":tpe,"url": url})
         return aa
     def schedule(self):
         put_fetch_queue((0, self.tpe, self.url))
-    def __init__(self, tpe, url):
-        self.url=url
-        self.tpe=tpe
+    # def __init__(self, tpe, url):
+    # self.url=url
+    # self.tpe=tpe
     def __json__(self):
         return CrawlUrlSchema().dump(self)[0]

@@ -55,7 +59,10 @@ class CrawlCache(Base2):
     url=Column(String(250))
     fetched=Column(DateTime)
     raw=Column(Text)
+    __schema__=CrawlCacheSchema
+    __jsonattrs__=None
+    __jsonid__='crawlcache'
+    __whiteattrs__= []
     def __init__(self, url,rw):
         self.url=url
         self.raw=rw
@@ -5,6 +5,7 @@ from src.database import db_session
 from mqueues import fetch_queue, compile_queue, put_fetch_queue
 from fetching import fetch_page, downloadfile, announce_articleid
 from fixing import fix_html, fix_file
+#from src import app

 from compiler import article_types
 from fixing import fix_link
@@ -28,12 +29,19 @@ def process_article(art):
     aa.last_fetched=datetime.now()
     aa.sourcetype=art["sourcetype"]
     db_session.add(aa)
+    try:
         db_session.commit()
+    except InvalidRequestError,e:
+        db_session.rollback()
+        clogger.error(e)
     clogger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
-    db_session.close()
+    return aa
+    # app.logger.info("Updated/Added Article "+ str(aa.id) + ": " + (aa.title.encode("utf-8")))
+    # db_session.close()
     # announce_articleid(aa.id)
     #
-    return aa


 # process a single found url
 def process_url(url,tpe, parent_url,params={}):
@@ -45,7 +45,7 @@ def run_fetch():
         tc, tpe, url, p= fetch_queue.get()
     except ValueError:
         tc, tpe, url= fetch_queue.get()
-
+    clogger.debug("fetched : "+url)
     if tpe is not "dummyarticle" and tpe is not "dummyindex":
         rw=fetch_page(url)
     else:
@@ -79,8 +79,8 @@ def urls_lst():
 def urls_json(id):
     # Lade Alle Urls
     status=CrawlUrl.query.get(id)
-    cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
-    return jsonify(urls=status, cache=cc.__json__())
+    # cc=CrawlCache.query.filter(CrawlCache.url==status.url).first()
+    return jsonify(urls=status)

 # que an existing CrawlUrl for fetching
 @compiler_pages.route("/urls/<int:id>/que")
@@ -33,13 +33,16 @@ db_session = scoped_session(sessionmaker(autocommit=False,
                                          # autoflush=False,
                                          bind=engine))

-engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)
+if cfg.get("db_urls_type") == "mysql":
+    engine2 = create_engine("mysql+pymysql://%s:%s@localhost/crawler_urls?charset=utf8" % (cfg.get("db_urls_user"), cfg.get("db_urls_pw")))
+else:
+    engine2 = create_engine('sqlite:///'+ path.join(db_path,db_urlfile), convert_unicode=True)

 db_session2 = scoped_session(sessionmaker(autocommit=False,
                                           autoflush=False,
                                           bind=engine2))
-from database_mbase import MyBase
+from database_mbase import MyBase,MyBase2


 #Base = declarative_base()
@@ -47,7 +50,7 @@ from database_mbase import MyBase
 Base=declarative_base(cls=MyBase)
 Base.query = db_session.query_property()

-Base2 = declarative_base()
+Base2 = declarative_base(cls=MyBase2)
 Base2.query = db_session2.query_property()

 def read_json(rq):
@@ -33,3 +33,35 @@ class MyBase(object):
             setattr(a, c.key,data[c.key])
         return a

+class MyBase2(object):
+    id = Column(Integer, primary_key=True)
+    # created_at = Column(TIMESTAMP, default=datetime.utcnow, nullable=False)
+    # updated_at = Column(TIMESTAMP, default=datetime.utcnow, onupdate=datetime.utcnow, nullable=False)
+    def __json__(self):
+        if self.__jsonattrs__ is None:
+            return self.__schema__().dump(self)[0]
+        else:
+            return self.__schema__(only=self.__jsonattrs__).dump(self)[0]
+    # def __init__(self, data={}):
+    # self.update(data,False)
+
+    def update(self,data, partial=True):
+        data, errors=self.__schema__( only=self.__whiteattrs__).load(data, partial=partial)
+        if len(errors)>0:
+            clogger.error(errors)
+            return (False,errors)
+        else:
+            for a in self.__whiteattrs__:
+                if data.has_key(a):
+                    setattr(self,a,data[a])
+            return (True, [])
+
+    @classmethod
+    def deserialize(cls,data):
+        data, errors=cls.__schema__().load(data,partial=True)
+        a=cls()
+        for c in cls.__table__.columns:
+            if data.has_key(c.key):
+                setattr(a, c.key,data[c.key])
+        return a
+
dump_urls.py (new file, 20 lines)
@@ -0,0 +1,20 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+
+if len(sys.argv) <= 1:
+    raise Error("Kein Zieldateiname angegeben")
+
+def dump_crawlurl(a):
+    return CrawlUrlSchema().dump(a)
+
+def dump_crawlcache(a):
+    return CrawlCacheSchema().dump(a)
+
+file = open(sys.argv[1], "w+")
+data={}
+data["crawlurls"] = map(dump_crawlurl,CrawlUrl.query.all())
+#data["crawlcache"] = map(dump_crawlcache,CrawlCache.query.all())
+json.dump (data, file)
+file.close()
load_urls.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+from src.compiler.models import CrawlCache, CrawlCacheSchema
+from src.compiler.models import CrawlUrl, CrawlUrlSchema
+import sys
+import json
+from src.database import db_session2
+from sqlalchemy.exc import IntegrityError
+
+if len(sys.argv) <= 1:
+    raise Error("Kein Zieldateiname angegeben")
+
+
+def insert_array(array, cls, session):
+    for s in array:
+        if not isinstance(s,cls):
+            print type(s)
+        else:
+            try:
+                session.add(s)
+                session.commit()
+            except IntegrityError:
+                session.rollback()
+
+
+
+def load_crawlurl(a):
+    print a
+    return CrawlUrl.deserialize(a[0])
+def load_crawlcache(a):
+    return CrawlCache.deserialize(a[0])
+
+
+file = open(sys.argv[1], "r")
+data=json.load(file)
+file.close()
+
+if data.has_key("crawlurls"):
+    crawlurls=data["crawlurls"]
+    crawlurls = map (load_crawlurl, crawlurls)
+    insert_array(crawlurls, CrawlUrl, db_session2)
+
+if data.has_key("crawlcache"):
+    crawlcache=data["crawlcache"]
+    crawlcache = map (load_crawlcache, crawlcache)
+    insert_array(crawlcache, CrawlCache, db_session2)