Advertisement
milio48

test neo-parser.js

Apr 9th, 2025
252
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. /**
  2.  * neo-parser-dom.js
  3.  * Parses Blogger data rendered by the neo-data-provider theme (v6+)
  4.  * purely by reading the rendered DOM structure.
  5.  * Does NOT rely on window.neoBlogData injection.
  6.  *
  7.  * Usage:
  8.  * 1. Include this script in your Blogger theme (e.g., before </body>).
  9.  * 2. After the DOM is loaded, the parsed data will be available in window.parsedBloggerDataDOM.
  10.  */
  11.  
  /**
   * Entry point: builds the parsed-data object for the current page using only
   * the rendered DOM.
   *
   * Steps: read the basic blog info, classify the page, then run the parser that
   * matches the page type. Exceptions raised along the way are caught and
   * reported through the returned object (pageType 'parsing_error') rather than
   * thrown to the caller.
   *
   * @returns {Object} `{ blog, pageType, page }`:
   *   - `blog`: blog info plus `url` (always) and, depending on the page,
   *     `pageTitle`, `pageName` (label/archive) or `searchQuery` (search only).
   *   - `pageType`: 'homepage', 'archive', 'label', 'search', 'index' (list page
   *     whose subtype could not be resolved), 'item', 'static_page', 'error',
   *     'unknown' or 'parsing_error'.
   *   - `page`: array of posts for list pages, a detail object (or null) for
   *     the other page types, or an error description on 'parsing_error'.
   */
  function parseBloggerDataDOMOnly() {
      const parsedData = {
          blog: {},
          pageType: 'unknown', // Default page type
          page: null
      };

      try {
          // 1. Basic blog info (title, homepage URL).
          parsedData.blog = getBlogInfoFromDOM();

          // 2. Classify the page from its DOM structure.
          parsedData.pageType = determinePageTypeFromDOM(parsedData.blog.homepageUrl);

          // 3. Run the parser that matches the page type.
          switch (parsedData.pageType) {
              case 'homepage':
              case 'archive':
              case 'index': {
                  // Braces give the const below its own scope inside the switch.
                  const listPageData = parseListPageDOM(parsedData.pageType);
                  parsedData.page = listPageData.posts;

                  if (parsedData.pageType === 'index') {
                      // Refine 'index' into 'label' or 'search' when the heading allowed it.
                      parsedData.pageType = listPageData.subType;
                      if (listPageData.subType === 'label') {
                          parsedData.blog.pageName = listPageData.nameOrQuery;
                      } else if (listPageData.subType === 'search') {
                          parsedData.blog.searchQuery = listPageData.nameOrQuery;
                      }
                      // An unresolved subtype records neither: the raw heading is still
                      // available as blog.pageTitle, and it is not a search query.
                  } else if (parsedData.pageType === 'archive') {
                      parsedData.blog.pageName = listPageData.nameOrQuery;
                  }
                  parsedData.blog.pageTitle = listPageData.title;
                  break;
              }
              case 'item':
                  parsedData.page = parseItemPageDOM();
                  parsedData.blog.pageTitle = parsedData.page?.title || document.title;
                  break;
              case 'static_page':
                  parsedData.page = parseStaticPageDOM();
                  parsedData.blog.pageTitle = parsedData.page?.title || document.title;
                  break;
              case 'error':
                  parsedData.page = parseErrorPageDOM();
                  parsedData.blog.pageTitle = parsedData.page?.title || document.title;
                  break;
              default: // Unknown
                  parsedData.page = parseUnknownPageDOM();
                  parsedData.blog.pageTitle = parsedData.page?.title || document.title;
                  break;
          }
      } catch (error) {
          console.error("neo-parser-dom.js: Error during DOM parsing:", error);
          parsedData.pageType = 'parsing_error';
          parsedData.page = {
              error: "Failed to parse page data from DOM.",
              details: error?.message,
              stack: error?.stack
          };
          // Best effort: still try to expose the basic blog info.
          if (!parsedData.blog.title) {
              try {
                  parsedData.blog = getBlogInfoFromDOM();
              } catch (retryError) {
                  // Keep whatever blog object we already have, but leave a trace.
                  console.warn("neo-parser-dom.js: Could not recover basic blog info after a parsing error.", retryError);
              }
          }
      }

      // The current URL is recorded whether or not parsing succeeded.
      parsedData.blog.url = window.location.href;

      return parsedData;
  }
  88.  
  /**
   * Reads the blog's basic identity from the site header link
   * (header.neo-site-header h1 a).
   *
   * Starts from defaults (document.title, no homepage URL, current location) and
   * overrides title/homepageUrl when the header link is present. Lookup problems
   * are logged and the defaults are returned.
   *
   * @returns {Object} `{ title, homepageUrl, url }`.
   */
  function getBlogInfoFromDOM() {
      const info = {
          title: document.title, // used when the header link has no text
          homepageUrl: null,
          url: window.location.href
      };

      try {
          const headerLink = document.querySelector('header.neo-site-header h1 a');
          if (!headerLink) {
              console.warn("neo-parser-dom.js: Could not find header title link (header.neo-site-header h1 a).");
              return info;
          }

          const linkText = headerLink.textContent.trim();
          if (linkText) {
              info.title = linkText;
          }
          info.homepageUrl = headerLink.href ? headerLink.href : null;
      } catch (lookupError) {
          console.error("neo-parser-dom.js: Error getting basic blog info.", lookupError);
      }

      return info;
  }
  112.  
  /**
   * Tells whether two absolute URLs point at the same page, ignoring protocol,
   * query string, fragment and trailing slashes.
   *
   * @param {string} currentUrl - URL of the page being viewed.
   * @param {string} referenceUrl - URL to compare against.
   * @returns {boolean} True when host and path match.
   */
  function isSameBlogPage(currentUrl, referenceUrl) {
      if (currentUrl === referenceUrl) return true;
      try {
          const current = new URL(currentUrl);
          const reference = new URL(referenceUrl);
          const trimSlashes = (path) => path.replace(/\/+$/, '');
          return current.host === reference.host &&
              trimSlashes(current.pathname) === trimSlashes(reference.pathname);
      } catch {
          // At least one value is not an absolute URL; exact equality already failed.
          return false;
      }
  }

  /**
   * Determines the page type by looking for the theme's container elements.
   *
   * Dedicated containers are checked first, in priority order. If none is
   * present but the post list table (#neo-data-table-posts) exists, the page is
   * treated as the homepage when the current location matches `homepageUrl`
   * (query string, fragment, protocol and trailing slash are ignored, so a URL
   * such as `/?m=1` still counts), otherwise as a generic 'index' page.
   *
   * @param {string|null} homepageUrl - The determined homepage URL.
   * @returns {string} 'homepage', 'item', 'static_page', 'archive', 'index',
   *   'error' or 'unknown'.
   */
  function determinePageTypeFromDOM(homepageUrl) {
      // Order matters: the first container found decides the type.
      const containerTypes = [
          ['div.neo-homepage-data', 'homepage'],
          ['article.neo-post[itemscope]', 'item'],
          ['div.neo-custom-page', 'static_page'],
          ['div.neo-archive-data', 'archive'],
          ['div.neo-index-data', 'index'], // Label or Search
          ['div.neo-error-page', 'error'],
          ['div.neo-unknown-page', 'unknown']
      ];
      for (const [selector, pageType] of containerTypes) {
          if (document.querySelector(selector)) return pageType;
      }

      // Fallback: no dedicated container, rely on the post list table.
      if (document.getElementById('neo-data-table-posts')) {
          if (homepageUrl && isSameBlogPage(window.location.href, homepageUrl)) {
              return 'homepage';
          }
          console.warn("neo-parser-dom.js: Found post list table but no specific container (archive/index). Assuming 'index'.");
          return 'index';
      }

      console.warn("neo-parser-dom.js: Could not determine page type from known elements.");
      return 'unknown'; // Default if nothing matches
  }
  145.  
  146.  
  /**
   * Parses a list page (homepage, archive or index): reads the page heading from
   * the type's container and collects the posts from the post list table.
   *
   * For archive and index pages the heading is also matched against the
   * "<Kind>: <value> - Post Data" patterns to extract the archive name, label
   * name or search query; when no pattern matches, the whole heading is used.
   *
   * @param {string} initialPageType - 'homepage', 'archive' or 'index'.
   * @returns {Object} `{ title, posts, subType, nameOrQuery }`; `subType` is
   *   refined to 'label' or 'search' for index pages when the heading allows it.
   */
  function parseListPageDOM(initialPageType) {
      const headingSelector = 'h2'; // Usually H2 for these pages
      const selectorByType = new Map([
          ['homepage', '.neo-homepage-data'],
          ['archive', '.neo-archive-data'],
          ['index', '.neo-index-data']
      ]);

      const listing = {
          title: document.title, // kept when no heading is found
          posts: [],
          subType: initialPageType,
          nameOrQuery: null
      };

      // Unknown type: there is no container to look in, just try the table.
      if (!selectorByType.has(initialPageType)) {
          console.warn(`neo-parser-dom.js: Unexpected initialPageType in parseListPageDOM: ${initialPageType}`);
          listing.posts = parsePostListTableDOM();
          return listing;
      }

      const wrapperSelector = selectorByType.get(initialPageType);
      const wrapper = document.querySelector(wrapperSelector);
      if (!wrapper) {
          console.warn(`neo-parser-dom.js: Could not find container element: ${wrapperSelector}. Parsing table globally.`);
          listing.posts = parsePostListTableDOM();
          return listing;
      }

      const heading = wrapper.querySelector(headingSelector);
      if (!heading) {
          console.warn(`neo-parser-dom.js: Could not find title element (${headingSelector}) in ${wrapperSelector}.`);
      } else {
          const headingText = heading.textContent.trim();
          listing.title = headingText;

          if (initialPageType === 'archive') {
              // "Archive: Monthly Archive - Post Data" -> "Monthly Archive"
              const archiveHit = /^Archive:\s*(.*?)\s*-\s*Post Data$/i.exec(headingText);
              listing.nameOrQuery = archiveHit ? archiveHit[1] : headingText;
          } else if (initialPageType === 'index') {
              // 'Search Results: "query" - Post Data' -> "query"
              const searchHit = /^Search Results:\s*"(.*?)"\s*-\s*Post Data$/i.exec(headingText);
              // "Label: LabelName - Post Data" -> "LabelName"
              const labelHit = /^Label:\s*(.*?)\s*-\s*Post Data$/i.exec(headingText);

              let resolved = ['index', headingText]; // subtype could not be determined
              if (searchHit) {
                  resolved = ['search', searchHit[1]];
              } else if (labelHit) {
                  resolved = ['label', labelHit[1]];
              }
              [listing.subType, listing.nameOrQuery] = resolved;
          }
      }

      // Scope the table lookup to the container that was found.
      listing.posts = parsePostListTableDOM(wrapper);
      return listing;
  }
  220.  
  221.  
  /**
   * Reads the post list table (#neo-data-table-posts) into an array of post
   * objects, using only the DOM.
   *
   * Each data row is expected to have nine cells, in this order: ID, Title, URL,
   * Published ISO, Published Formatted, Author, Labels, Snippet, First Image URL.
   * A single-cell row carrying a colspan (the "No posts" placeholder) is skipped
   * silently; rows with any other cell count are skipped with a warning.
   *
   * @param {Element} [container=document] - Optional element to search within.
   * @returns {Array<Object>} Post objects; empty when the table is not found.
   */
  function parsePostListTableDOM(container = document) {
      const postTable = container.querySelector('#neo-data-table-posts');
      if (!postTable) {
          const scopeForLog = container === document ? "Document" : container;
          console.warn("neo-parser-dom.js: Table #neo-data-table-posts not found within the scope.", scopeForLog);
          return [];
      }

      const collected = [];
      const tableRows = Array.from(postTable.querySelectorAll('tbody tr'));

      for (const [rowIndex, row] of tableRows.entries()) {
          const cells = row.querySelectorAll('td');

          const isPlaceholderRow = cells.length === 1 && cells[0].hasAttribute('colspan');
          if (isPlaceholderRow) continue;

          if (cells.length !== 9) {
              console.warn(`neo-parser-dom.js: Row ${rowIndex} in table skipped: Expected 9 cells, found ${cells.length}.`, row);
              continue;
          }

          try {
              // Trimmed text of the cell at the given position.
              const textAt = (position) => cells[position]?.textContent.trim();
              const toLabel = (item) => ({
                  name: item.getAttribute('neo-label-name') || item.textContent.trim(),
                  url: item.getAttribute('data-label-url') || null
              });

              const labelItems = cells[6]?.querySelectorAll('li');
              const isoFromTimeTag = cells[3]?.querySelector('time')?.getAttribute('datetime');
              const rawImageUrl = textAt(8);
              const hasImage = Boolean(rawImageUrl) && rawImageUrl !== '(No image)';

              collected.push({
                  // The row's data-post-id wins over the text of the ID cell.
                  postId: row.dataset.postId || textAt(0) || null,
                  title: textAt(1) || '',
                  url: textAt(2) || '', // Plain text URL
                  publishedIso: isoFromTimeTag || textAt(3) || '',
                  publishedFormatted: textAt(4) || '',
                  author: textAt(5) || '',
                  labels: labelItems ? Array.from(labelItems).map(toLabel) : [],
                  snippet: textAt(7) || '',
                  firstImageUrl: hasImage ? rawImageUrl : null
              });
          } catch (cellError) {
              // A malformed row is dropped; the remaining rows are still parsed.
              console.warn(`neo-parser-dom.js: Error parsing cells in table row ${rowIndex}:`, cellError, row);
          }
      }

      return collected;
  }
  272.  
  /**
   * Extracts the details of a single post from an item page, using only the DOM.
   *
   * Everything is read from the article element (article.neo-post[itemscope]);
   * fields whose element is missing fall back to an empty value, and the title
   * falls back to document.title. The author is not extracted.
   *
   * @returns {Object|null} `{ postId, title, url, publishedIso,
   *   publishedFormatted, labels, bodyHtml }`, or null when the article element
   *   is not on the page.
   */
  function parseItemPageDOM() {
      const articleEl = document.querySelector('article.neo-post[itemscope]');
      if (!articleEl) {
          console.warn("neo-parser-dom.js: Article element for Item page not found.");
          return null;
      }

      const bodyEl = articleEl.querySelector('div[itemprop="articleBody"]');
      const headlineEl = articleEl.querySelector('h1[itemprop="headline"]');
      const publishedEl = articleEl.querySelector('time[itemprop="datePublished"]');
      const keywordLinks = articleEl.querySelectorAll('.neo-post-labels a[itemprop="keywords"]');

      const toLabel = (link) => ({
          name: link.textContent.trim(),
          url: link.href || null
      });

      return {
          // The body element's id carries the post id after a fixed prefix.
          postId: bodyEl?.id?.replace('neo-post-body-', '') || null,
          title: headlineEl?.textContent.trim() || document.title,
          url: window.location.href,
          publishedIso: publishedEl?.getAttribute('datetime') || '',
          publishedFormatted: publishedEl?.textContent.trim() || '',
          labels: keywordLinks ? Array.from(keywordLinks).map(toLabel) : [],
          bodyHtml: bodyEl?.innerHTML || ''
      };
  }
  306.  
  /**
   * Extracts the details of a static page from its container
   * (div.neo-custom-page), using only the DOM.
   *
   * @returns {Object|null} `{ pageId, title, url, bodyHtml }`, or null when the
   *   container is not on the page. The title falls back to document.title.
   */
  function parseStaticPageDOM() {
      const wrapper = document.querySelector('div.neo-custom-page');
      if (!wrapper) {
          console.warn("neo-parser-dom.js: Container element for Static page not found.");
          return null;
      }

      const bodyEl = wrapper.querySelector('div.neo-page-content');
      const headingEl = wrapper.querySelector('h1.neo-page-title');

      return {
          // The content element's id carries the page id after a fixed prefix.
          pageId: bodyEl?.id?.replace('neo-post-body-', '') || null,
          title: headingEl?.textContent.trim() || document.title,
          url: window.location.href,
          bodyHtml: bodyEl?.innerHTML || ''
      };
  }
  328.  
  /**
   * Reads the heading and message of an error page from its container
   * (div.neo-error-page), using only the DOM.
   *
   * @returns {Object} `{ title, message }`; each falls back to a fixed default
   *   ("Error" / "Page not found.") when its element is missing or empty.
   */
  function parseErrorPageDOM() {
      const wrapper = document.querySelector('div.neo-error-page');
      // Trimmed text of the first matching descendant, or the given default.
      const textOr = (selector, fallback) =>
          wrapper?.querySelector(selector)?.textContent.trim() || fallback;

      return {
          title: textOr('h1', "Error"),
          message: textOr('p', "Page not found.")
      };
  }
  339.  
  /**
   * Reads the heading and message of an unrecognised page from its container
   * (div.neo-unknown-page), using only the DOM.
   *
   * @returns {Object} `{ title, message }`; each falls back to a fixed default
   *   ("Unknown Page" / "Unknown page type detected.") when its element is
   *   missing or empty.
   */
  function parseUnknownPageDOM() {
      const wrapper = document.querySelector('div.neo-unknown-page');
      const headingEl = wrapper?.querySelector('h1');
      const paragraphEl = wrapper?.querySelector('p');

      const result = {};
      result.title = headingEl?.textContent.trim() || "Unknown Page";
      result.message = paragraphEl?.textContent.trim() || "Unknown page type detected.";
      return result;
  }
  350.  
  351.  
  // --- Execution ---

  /**
   * Runs the DOM-only parser as soon as the document has been parsed and stores
   * the result globally in window.parsedBloggerDataDOM.
   *
   * While the document is still loading, the run is deferred to
   * DOMContentLoaded. If that event has already fired (for example when the
   * script is loaded with `async` or injected later), the parser runs
   * immediately; a listener registered at that point would never be called.
   */
  function runNeoParserWhenReady() {
      const publishParsedData = () => {
          window.parsedBloggerDataDOM = parseBloggerDataDOMOnly();
      };

      if (document.readyState === 'loading') {
          document.addEventListener('DOMContentLoaded', publishParsedData, { once: true });
      } else {
          publishParsedData();
      }
  }

  // Auto-start only where a DOM exists, so the file can also be loaded in a
  // non-browser environment (e.g. by a test runner) without throwing.
  if (typeof document !== 'undefined') {
      runNeoParserWhenReady();
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement