Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- // In 2020 or 2019, the structure of Twitter archives has been changed dramatically
- // In the new version, all media resources (.jpg and .png images, and .mp4 videos) are archived as well.
- // This boosts the storage demand (my archive needs 1.8GB now, 95% of which is used for media)
- // and makes it unfit for online usage.
- // Also, the data model for the tweets and the layout for the storage has been changed.
- // For more details, see this document (German language)
- // https://docs.google.com/document/d/1_sfWUF5AJ43007tQ07sZu9pnUMYHuzIkX0nr-vleuWc/edit?usp=sharing
- // This Node Script maps the new format into the older format.
- // - It reads the file tweets.js which contains the text and metadata of all tweets
- // - It generates partitions for each month, named and with data format required by the old twitter archive
- // - Data for retweeted users will be extracted from the list of user mentions
// Stand-in for the browser global: the archive file stores its data under
// window.YTD.tweet, the (new) JSON structure that wraps the tweet data.
global.window = { YTD: { tweet: {} } };

const fs = require('fs');

// Directory prefix for every file this script writes.
const DIR = "test/";

// Prefix of the variable each monthly partition file assigns its data to.
const TWEET_NAME_PREFIX = "Grailbird.data.tweets_";

// User record attached to tweets that do not carry a user of their own.
const MYSELF = {
  id: 192979771,
  id_str: "192979771",
  name: "Rüdiger Plantiko",
  profile_image_url_https: "https://pbs.twimg.com/profile_images/1178630979/plantiko_normal.jpg",
  protected: false,
  screen_name: "rplantiko",
  verified: false
};

// One entry per written partition; filled by writeToFile, emitted by writeIndex.
const tweet_index = [];

// All tweets of the archive, sorted by ID.
const tweets = readAllTweets();

// Driver: write the monthly partitions first, then the index describing them.
writeByMonth(tweets);
writeIndex();
/**
 * Loads the archive's tweet file and returns its entries ordered by tweet ID.
 *
 * Requiring the file is done purely for its side effect: the entries are
 * afterwards read from window.YTD.tweet.part0.
 *
 * @returns {Array} the part0 array itself, sorted in place with byID
 */
function readAllTweets() {
  require('../tweet.js');
  const entries = window.YTD.tweet.part0;
  entries.sort(byID);
  return entries;
}
/**
 * Splits the tweets into consecutive runs sharing the same year and month and
 * hands each run to writeToFile.
 *
 * The input is expected to be ordered so that tweets of one month are
 * adjacent; a change of year or month closes the current partition.
 * Year and month are taken from the local-time interpretation of created_at.
 *
 * @param {Array<{tweet: Object}>} tweets - archive entries, each wrapping a tweet
 */
function writeByMonth(tweets) {
  let year = 0;
  let month = 0;
  // Declared locally: the original assigned to an undeclared `tt`, which
  // leaks a global in sloppy mode and throws in strict mode / ES modules.
  let partition = [];

  // Empty partitions are never written, so no dummy call is made before the
  // first tweet or for an empty archive.
  const flush = () => {
    if (partition.length > 0) {
      writeToFile(year, month, partition);
    }
  };

  for (const entry of tweets) {
    const tweet = entry.tweet;
    const created = new Date(tweet.created_at);
    const y = created.getFullYear();
    const m = created.getMonth();
    if (y !== year || m !== month) {
      flush();
      year = y;
      month = m;
      partition = [];
    }
    partition.push(map_tweet(tweet));
  }

  // Don't forget the last partition.
  flush();
}
/**
 * Sorts the collected partition index newest-first (by year, then month) and
 * writes it to DIR + "tweet_index.js" as a `var tweet_index = [...]` script.
 *
 * The index array is sorted in place. A failed write is rethrown from the
 * callback instead of being silently ignored, matching writeToFile.
 */
function writeIndex() {
  // Descending: later years first, and within a year later months first.
  tweet_index.sort((a, b) => (b.year - a.year) || (b.month - a.month));

  const content = "var tweet_index = " + JSON.stringify(tweet_index, null, 2);
  fs.writeFile(DIR + "tweet_index.js", content, 'utf8', (err) => {
    if (err) throw err;
  });
}
/**
 * Maps a tweet from the new archive format to the field set of the old format.
 *
 * The text is taken from full_text, falling back to text when full_text is
 * absent. The user defaults to MYSELF when the tweet carries none.
 *
 * Simple solution for retweets: if the text starts with "RT @user:", the
 * matching entry of entities.user_mentions (if any) becomes the tweet's user.
 * That mention object is reused, not copied, and its indices are coerced to
 * numbers in place (in the new format the indices are given as strings).
 *
 * @param {Object} t - tweet in the new archive format
 * @returns {Object} tweet with source, id, id_str, created_at, text, user, entities
 */
function map_tweet(t) {
  const text = t.full_text !== undefined ? t.full_text : t.text;
  const mapped = {
    source: t.source,
    id: t.id,
    id_str: t.id_str,
    created_at: t.created_at,
    text: text,
    user: t.user || MYSELF,
    entities: t.entities
  };

  // Without a text there is nothing to inspect for the retweet prefix.
  const retweet = typeof text === 'string' ? text.match(/^RT @([^:]*):/) : null;
  if (!retweet) {
    return mapped;
  }

  // Tolerate tweets without entities or without a user_mentions list.
  const mentions = (t.entities && t.entities.user_mentions) || [];
  const mention = mentions.find((u) => u.screen_name === retweet[1]);
  if (mention) {
    if (Array.isArray(mention.indices)) {
      mention.indices = mention.indices.map((x) => Number(x));
    }
    mapped.user = mention;
  }
  return mapped;
}
/**
 * Writes one monthly partition to DIR + "<year>_<MM>.js" in the old archive's
 * format and records it in tweet_index.
 *
 * Calls with an empty partition or a year before 2006 (the year Twitter
 * launched) are ignored; this also filters the dummy (0, 0, []) call made
 * before the first tweet has been seen.
 *
 * @param {number} year - full year of the partition
 * @param {number} month - zero-based month, as returned by Date#getMonth
 * @param {Array} tt - the mapped tweets of that month
 */
function writeToFile( year, month, tt ) {
  if (tt.length === 0 || year < 2006) return;

  const month2 = String(month + 1).padStart(2, '0');
  const monthString = `${ year }_${ month2 }`;
  const json = TWEET_NAME_PREFIX + monthString + " = \n" + JSON.stringify(tt);

  fs.writeFile( DIR + monthString + '.js', json, 'utf8', (err) => {
    if (err) throw err;
  } );

  tweet_index.push({
    // Plain forward slashes: the old archive's "data\/js\/tweets\/" is only
    // JSON escaping of "/", so the actual value must not contain backslashes.
    file_name: "data/js/tweets/" + monthString + ".js",
    var_name: "tweets_" + monthString,
    year: year,
    month: month + 1,
    tweet_count: tt.length
  });
}
/**
 * Sort comparator ordering archive entries by ascending tweet ID.
 *
 * IDs are converted to BigInt before subtracting.
 *
 * @param {{tweet: {id: (string|number)}}} a
 * @param {{tweet: {id: (string|number)}}} b
 * @returns {number} -1, 0 or 1
 */
function byID(a, b) {
  const delta = BigInt(a.tweet.id) - BigInt(b.tweet.id);
  if (delta < 0) {
    return -1;
  }
  if (delta > 0) {
    return 1;
  }
  return 0;
}
Add Comment
Please, Sign In to add comment