Advertisement
cheungtifan

Untitled

Nov 26th, 2012
394
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.70 KB | None | 0 0
  1. #!/usr/bin/python
  2. # -*- coding: UTF-8 -*-
  3. # vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
  4.  
  5. import cPickle
  6. import hashlib
  7. import json
  8. import os
  9. import re
  10. import urllib2
  11. import time
  12. from BeautifulSoup import BeautifulSoup
  13. import feedparser
  14.  
# Pickle file holding the state kept between runs; main() stores the guid of
# the newest feed entry here (see load_state / save_state).
STATE_FILE = 'state.pkl'
# Feed parsed by main() to discover articles.
FEED_SOURCE = 'http://www.ifanr.com/feed/user/pure'
# Endpoint that upload() POSTs each article to.
FEED_DEST = 'http://10010app.cn/OpenApi_test/content/'
# Provider key: part of the MD5 signature input and returned as 'pKey' by
# sign_request().
PROVIDER_KEY = '509b56941bbb5813fc2f7898'
# Provider identity fields that upload() adds to every article payload.
PROVIDER_ID = '24'
PROVIDER_NAME = u'爱范儿'
# Secret appended to the signature input in sign_request().
# NOTE(review): credentials are hard-coded in source -- consider loading them
# from the environment or a config file instead.
SECRET = '3'
# Content category fields that upload() adds to every article payload.
TYPE_ID = '1216'
TYPE_NAME = u'科技'
  24.  
  25.  
  26. def sign_request():
  27. timestamp = time.strftime('%Y%m%d%H%M%S')
  28. pre_sign = ''.join([timestamp, PROVIDER_KEY, SECRET])
  29. signature_hash = hashlib.md5(pre_sign)
  30. signature = signature_hash.hexdigest()
  31. return {'tm': timestamp, 'code': signature, 'pKey': PROVIDER_KEY}
  32.  
  33. def load_state():
  34. try:
  35. state_file = open(STATE_FILE, 'rb')
  36. except IOError:
  37. state_file = open(STATE_FILE, 'wb')
  38. cPickle.dump('-1', state_file)
  39. state_file.close()
  40. state_file = open(STATE_FILE, 'rb')
  41.  
  42. state = cPickle.load(state_file)
  43. return state
  44.  
  45.  
  46. def save_state(state):
  47. state_file = open(STATE_FILE, 'wb')
  48. cPickle.dump(state, state_file)
  49. state_file.close()
  50.  
  51.  
  52. def content_processor(content_html):
  53. html_ = ''.join([c['value'] for c in content_html if 'value' in c])
  54. soup = BeautifulSoup(html_)
  55. images = [i['src'] for i in soup('img')]
  56. for idx, image in enumerate(images):
  57. soup.find('img', src=image).replaceWith('#[IMG#%s]#' % idx)
  58. full_text = ''.join([x.strip(' ') for x in soup(text=True)]).replace('#[',
  59. '<!--').replace(']#', '-->').replace('\n\n', '\n').replace('\n',
  60. '\\r\\n\n').split('\n')
  61. img_ret = [{'name': 'IMG#%d' % idx, 'value': img} for idx, img in enumerate(images)]
  62. if len(full_text[0]) < 500:
  63. content_header = full_text[0]
  64. content_left = '\n'.join(full_text[1:])
  65. else:
  66. content_header = full_text[0][:400]
  67. content_left = '\n'.join([full_text[0][400:], '\n'.join(full_text[1:])])
  68. contents = [{'name': 'content_all', 'value': '\n'.join(full_text)},
  69. {'name': 'content_header', 'value': content_header},
  70. {'name': 'content_left', 'value': content_left}]
  71. contents.extend(img_ret)
  72. return contents
  73.  
  74.  
  75. def entry_to_dict(entry):
  76. def tag_getter():
  77. if 'tags' in entry:
  78. return [y['term'] for y in entry.tags if 'term' in y]
  79. return []
  80.  
  81. def postid():
  82. return re.search('\d+$', entry.link).group()
  83.  
  84. ret = {}
  85. ret['contentName'] = entry.title
  86. ret['description'] = entry.description
  87. ret['tags'] = tag_getter()
  88. ret['ext'] = content_processor(entry.content)
  89. ret['providerPid'] = postid()
  90. return ret
  91.  
  92.  
  93. def upload(data):
  94. data['typeId'] = TYPE_ID
  95. data['typeName'] = TYPE_NAME
  96. data['providerId'] = PROVIDER_ID
  97. data['providerName'] = PROVIDER_NAME
  98. data['price'] = 0
  99. data['platform'] = 'all'
  100.  
  101. final_data = {'data': data}
  102. sign = sign_request()
  103. for k in sign:
  104. final_data[k] = sign[k]
  105. print json.dumps(data)
  106. req = urllib2.Request(FEED_DEST, json.dumps(data), {'Content-Type': 'application/json'})
  107. u = urllib2.urlopen(req)
  108. print u.info()
  109. print u.read()
  110. pass
  111.  
  112.  
  113. def main():
  114. last_article = load_state()
  115. main_feed = feedparser.parse(FEED_SOURCE)
  116. current_latest = main_feed.entries[0]['guid']
  117. for entry in main_feed.entries:
  118. if entry['guid'] == last_article:
  119. break
  120. e = entry_to_dict(entry)
  121. upload(e)
  122. save_state(current_latest)
  123. print current_latest, last_article
  124. # iterate the main feed and send out all new feeds
  125.  
  126.  
  127. if __name__ == '__main__':
  128. os.environ['TZ'] = 'CST-08'
  129. time.tzset()
  130. main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement