#!/usr/bin/python
# -*- coding: UTF-8 -*-
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
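"""Sync new ifanr articles to the 10010app OpenApi content endpoint.

Each run parses FEED_SOURCE, uploads every entry newer than the GUID
pickled in STATE_FILE, and then records the newest GUID so subsequent
runs only push articles published since the last run.
"""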
import cPickle
import hashlib
import json
import os
import re
import time
import urllib2

from BeautifulSoup import BeautifulSoup
import feedparser
STATE_FILE = 'state.pkl'  # pickled GUID of the newest entry already uploaded
FEED_SOURCE = 'http://www.ifanr.com/feed/user/pure'
FEED_DEST = 'http://10010app.cn/OpenApi_test/content/'
PROVIDER_KEY = '509b56941bbb5813fc2f7898'
PROVIDER_ID = '24'
PROVIDER_NAME = u'爱范儿'  # "ifanr"
SECRET = '3'
TYPE_ID = '1216'
TYPE_NAME = u'科技'  # "Technology"
def sign_request():
    """Build the auth parameters: code = md5(tm + PROVIDER_KEY + SECRET)."""
    timestamp = time.strftime('%Y%m%d%H%M%S')
    pre_sign = ''.join([timestamp, PROVIDER_KEY, SECRET])
    signature = hashlib.md5(pre_sign).hexdigest()
    return {'tm': timestamp, 'code': signature, 'pKey': PROVIDER_KEY}
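# For example, a request signed at the (made-up) timestamp
# 2012-11-01 12:00:00 would carry:
#   tm   = '20121101120000'
#   code = hashlib.md5('20121101120000' + PROVIDER_KEY + SECRET).hexdigest()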
def load_state():
    """Return the GUID of the last uploaded entry ('-1' on first run)."""
    try:
        state_file = open(STATE_FILE, 'rb')
    except IOError:
        # First run: seed the state file with a sentinel GUID.
        save_state('-1')
        state_file = open(STATE_FILE, 'rb')
    state = cPickle.load(state_file)
    state_file.close()
    return state
def save_state(state):
    state_file = open(STATE_FILE, 'wb')
    cPickle.dump(state, state_file)
    state_file.close()
def content_processor(content_html):
    """Flatten a feedparser content list into the API's ext fields."""
    html_ = ''.join([c['value'] for c in content_html if 'value' in c])
    soup = BeautifulSoup(html_)
    # Replace every <img> with a numbered placeholder so the plain text
    # keeps a marker for where each image belongs.
    images = [i['src'] for i in soup('img')]
    for idx, image in enumerate(images):
        soup.find('img', src=image).replaceWith('#[IMG#%s]#' % idx)
    # Extract the text, turn the placeholders into HTML comments,
    # collapse blank lines, and mark each line break with a literal \r\n.
    text = ''.join([x.strip(' ') for x in soup(text=True)])
    text = text.replace('#[', '<!--').replace(']#', '-->')
    text = text.replace('\n\n', '\n').replace('\n', '\\r\\n\n')
    full_text = text.split('\n')
    img_ret = [{'name': 'IMG#%d' % idx, 'value': img}
               for idx, img in enumerate(images)]
    # Split an over-long first paragraph so the header stays short.
    if len(full_text[0]) < 500:
        content_header = full_text[0]
        content_left = '\n'.join(full_text[1:])
    else:
        content_header = full_text[0][:400]
        content_left = '\n'.join([full_text[0][400:],
                                  '\n'.join(full_text[1:])])
    contents = [{'name': 'content_all', 'value': '\n'.join(full_text)},
                {'name': 'content_header', 'value': content_header},
                {'name': 'content_left', 'value': content_left}]
    contents.extend(img_ret)
    return contents
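# Example (hypothetical input): for entry content whose HTML is
#   <p>Hello <img src="http://example.com/a.png" /> world</p>
# the text fields come back as 'Hello<!--IMG#0-->world' (note that the
# per-node strip(' ') also eats the spaces around the image) and the
# list ends with {'name': 'IMG#0', 'value': 'http://example.com/a.png'}.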
def entry_to_dict(entry):
    """Map a feedparser entry onto the fields the upload API expects."""
    def tag_getter():
        if 'tags' in entry:
            return [y['term'] for y in entry.tags if 'term' in y]
        return []
    def postid():
        # ifanr article links end with the numeric post id.
        return re.search(r'\d+$', entry.link).group()
    ret = {}
    ret['contentName'] = entry.title
    ret['description'] = entry.description
    ret['tags'] = tag_getter()
    ret['ext'] = content_processor(entry.content)
    ret['providerPid'] = postid()
    return ret
def upload(data):
    data['typeId'] = TYPE_ID
    data['typeName'] = TYPE_NAME
    data['providerId'] = PROVIDER_ID
    data['providerName'] = PROVIDER_NAME
    data['price'] = 0
    data['platform'] = 'all'
    final_data = {'data': data}
    # Merge the signature parameters into the request body.
    sign = sign_request()
    for k in sign:
        final_data[k] = sign[k]
    print json.dumps(final_data)
    # Send final_data (not the bare data dict) so the signature is included.
    req = urllib2.Request(FEED_DEST, json.dumps(final_data),
                          {'Content-Type': 'application/json'})
    u = urllib2.urlopen(req)
    print u.info()
    print u.read()
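# The resulting POST body has the shape (values abbreviated, timestamp
# made up):
#   {"data": {"contentName": ..., "tags": [...], "ext": [...],
#             "providerPid": ..., "typeId": "1216", "price": 0, ...},
#    "tm": "20121101120000", "code": "<md5 hex>",
#    "pKey": "509b56941bbb5813fc2f7898"}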
def main():
    # Iterate the feed and upload every entry newer than the saved GUID.
    last_article = load_state()
    main_feed = feedparser.parse(FEED_SOURCE)
    if not main_feed.entries:
        return
    current_latest = main_feed.entries[0]['guid']
    for entry in main_feed.entries:
        if entry['guid'] == last_article:
            break
        e = entry_to_dict(entry)
        upload(e)
    save_state(current_latest)
    print current_latest, last_article

if __name__ == '__main__':
    # The feed timestamps are China Standard Time (UTC+8).
    os.environ['TZ'] = 'CST-08'
    time.tzset()
    main()