logo科技微讯

监测 Reddit 所有帖子和评论

作者:科技微讯
日期:2021-01-19
📝 笔记

原文

F5Bot is a free service that emails you when your selected keywords are mentioned on Reddit, Hacker News, or Lobsters. Use it to monitor your brand, your projects, or just topics that you're interested in.

f5bot 解释了它是如何监测 reddit 所有 post 和 comment 的。简单地说:

  • 先确定一个 post id,然后把这个 id 从 base36 转换成 base10 的数字;
  • 然后根据这个数字不断减一,获取过去 2000 个 post 的 id,转换成 base36;
  • 用下面这个 api,把 2000 个 id 分成 20 份,每一份附在这个 api 后获取数据;
// Placeholder for the id list sent to the endpoint; left empty in this snippet.
const id = "";
// Reddit `api/info.json` URL with the id list interpolated into the `id` query parameter.
const api = `https://api.reddit.com/api/info.json?id=${id}`;
  • 获取到的 post 数据可能没有 2000 条,其中一个原因是 private subreddit 的帖子内容无法获取;
  • 我试了一下,一分钟大概有 600 左右的新帖子,考虑到高峰期可能远远不止,所以可以每隔 1 分钟获取 2000 个帖子的数据;
  • 把 2000 条帖子的关键数据去重保存到数据库,就可以不断记录所有帖子;
  • 每 10 分钟从数据库中取出发布于 2 小时至 2 小时 10 分钟之前的帖子,获取它们的 ups 数量,就可以判断哪些帖子在发布两小时后成为了热门帖子;
  • f5bot 可能会用数千个关键词搜索每一个 post 的内容,为了加快搜索速度,用了 Aho–Corasick 算法,提前把关键词处理成树形结构(tree structure)。
const got = require("got");
var CronJob = require("cron").CronJob;
const arg = process.argv;
const mongoose = require("mongoose");

/** Logs a connection-level error reported by the Mongoose connection. */
function logConnectionError() {
  console.log("数据库连接失败...");
}

/** Logs that the Mongoose connection has been opened. */
function logConnectionOpen() {
  console.log("数据库连接成功!");
}

/**
 * Opens the default Mongoose connection and returns the model used to store
 * posts.
 *
 * The connection string is read from the `MONGODB_URI` environment variable
 * when it is set; otherwise the original built-in URI is used.
 *
 * @returns {Promise<{allVideoPostModel: object, connection: object}>} The
 *   schemaless `allvideopost` model and the default Mongoose connection.
 * @throws Whatever `mongoose.connect` rejects with when connecting fails.
 */
const connectDB = async function () {
  // SECURITY: the fallback URI embeds credentials in source control. Prefer
  // supplying MONGODB_URI from the environment and rotating this password.
  const uri =
    process.env.MONGODB_URI ||
    `mongodb+srv://test:dmmongoDB3.14@cluster0.ir1xt.azure.mongodb.net/reddit?retryWrites=true&w=majority`;

  const connection = mongoose.connection;

  // Listeners must be attached BEFORE connecting: "open" is emitted while
  // `connect()` is still pending, so a listener added afterwards misses it.
  // The default connection object is shared between calls, so each handler is
  // attached at most once to avoid piling up duplicate listeners.
  if (!connection.listeners("error").includes(logConnectionError)) {
    connection.on("error", logConnectionError);
  }
  if (!connection.listeners("open").includes(logConnectionOpen)) {
    connection.once("open", logConnectionOpen);
  }

  await mongoose.connect(uri, {
    useNewUrlParser: true,
    useUnifiedTopology: true,
    useFindAndModify: false,
  });

  // Reuse the model if it is already registered; `strict: false` lets
  // documents carry arbitrary fields.
  const allVideoPostModel =
    mongoose.models.allvideopost ||
    mongoose.model(
      "allvideopost",
      new mongoose.Schema({}, { strict: false })
    );

  return {
    allVideoPostModel,
    connection,
  };
};

/**
 * Returns the listing entry with the greatest `data.created_utc`.
 *
 * @param {Array<{data: {created_utc: number}}>} children Listing entries.
 * @returns {object|undefined} The newest entry, or `undefined` for an empty list.
 */
function findLatestPost(children) {
  return children.reduce(
    (latest, child) =>
      latest === undefined || child.data.created_utc > latest.data.created_utc
        ? child
        : latest,
    undefined
  );
}

/**
 * Builds the `api/info.json` URLs covering `batchCount * batchSize`
 * consecutive post ids, counting down from `newestPostNumber`.
 *
 * @param {number} newestPostNumber Newest post id as a base-10 number.
 * @param {number} [batchCount=20] Number of URLs to build.
 * @param {number} [batchSize=100] Number of ids per URL.
 * @returns {string[]} One URL per batch, ids given as comma-separated `t3_<base36 id>`.
 */
function buildInfoUrls(newestPostNumber, batchCount = 20, batchSize = 100) {
  const urls = [];
  for (let batch = 0; batch < batchCount; batch++) {
    const fullnames = [];
    for (let offset = 0; offset < batchSize; offset++) {
      const postNumber = newestPostNumber - (batch * batchSize + offset);
      fullnames.push(`t3_${postNumber.toString(36)}`);
    }
    urls.push(
      `https://api.reddit.com/api/info.json?id=${fullnames.join(",")}`
    );
  }
  return urls;
}

/**
 * Fetches one batch URL and reduces each returned post to the stored fields.
 * A failed request is logged and yields an empty list, so one bad batch does
 * not abort the whole run.
 *
 * @param {string} url An `api/info.json` URL.
 * @returns {Promise<Array<{postID: string, created_utc: number, post_hint: string}>>}
 */
async function fetchPostSummaries(url) {
  try {
    const listing = await got(url).json();
    return listing.data.children.map(({ data }) => ({
      postID: data.id,
      created_utc: data.created_utc,
      post_hint: data.post_hint || "",
    }));
  } catch (e) {
    console.log("出错:", e);
    return [];
  }
}

/**
 * Runs one collection pass: finds the newest post on r/all/new, looks up the
 * 2000 ids ending at it in 20 batches of 100, and upserts every returned post
 * (keyed by `postID`) into the database.
 *
 * The database connection is closed when the pass ends, whether it succeeded
 * or threw.
 *
 * @returns {Promise<void>}
 * @throws When connecting, fetching the listing, or any upsert fails.
 */
async function app() {
  const { allVideoPostModel, connection } = await connectDB();
  try {
    console.time("a");
    const newPosts = await got("https://www.reddit.com/r/all/new/.json").json();
    console.timeEnd("a");

    // Pick the newest entry explicitly instead of relying on sort direction.
    const latestPost = findLatestPost(newPosts.data.children);
    if (!latestPost) {
      console.log("没有获取到新帖子");
      return;
    }
    // Post ids are base36 strings; convert so neighbouring ids can be derived
    // by subtraction.
    const postNumber = parseInt(latestPost.data.id, 36);

    console.time("b");
    const urls = buildInfoUrls(postNumber);
    console.timeEnd("b");

    console.time("c");
    const batches = await Promise.all(
      urls.map((url) => fetchPostSummaries(url))
    );
    const posts = batches.flat();
    console.timeEnd("c");

    console.time("d");
    // Upsert keyed on postID so repeated runs do not create duplicates.
    await Promise.all(
      posts.map((post) =>
        allVideoPostModel
          .findOneAndUpdate({ postID: post.postID }, post, {
            new: true,
            lean: true,
            upsert: true,
          })
          .exec()
      )
    );
    console.timeEnd("d");
  } finally {
    connection.close(() => console.log("成功关闭数据库"));
  }
}

// Entry point: with a "now" command-line argument run a single pass
// immediately; otherwise run a pass on the cron pattern "0 */1 * * * *".
// Rejections from app() are caught here so a failed pass is logged instead of
// surfacing as an unhandled promise rejection.
if (arg.includes("now")) {
  app().catch((e) => {
    console.log("出错:", e);
    // Report the failed one-off run through the exit status.
    process.exitCode = 1;
  });
} else {
  const job = new CronJob("0 */1 * * * *", function () {
    console.log(new Date());
    app().catch((e) => console.log("出错:", e));
  });
  job.start();
}
donation赞赏
thumbsup0
thumbsdown0
暂无评论