Files
Crawler/crawler-fb/crawler-post.js
2025-11-21 10:42:37 +07:00

347 lines
10 KiB
JavaScript

require("dotenv").config();
const { Builder, By, until } = require("selenium-webdriver");
const chrome = require("selenium-webdriver/chrome");
const fs = require("fs");
/**
 * Suspends an async flow for a fixed amount of time.
 *
 * @param {number} ms - Delay in milliseconds, handed straight to setTimeout.
 * @returns {Promise<undefined>} Promise that resolves with no value once the timer fires.
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Builds a Chrome WebDriver session configured with this crawler's launch flags.
 *
 * @returns {Promise<object>} Resolves to whatever `Builder#build()` produces for Chrome.
 */
async function initializeDriver() {
  // Flags are applied in this order, one addArguments call per flag.
  // NOTE(review): the AutomationControlled and user-agent flags presumably make the
  // session look less like an automated browser — confirm the intent.
  const launchFlags = [
    "--start-maximized",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-blink-features=AutomationControlled",
    "--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
  ];

  const chromeOptions = new chrome.Options();
  for (const flag of launchFlags) {
    chromeOptions.addArguments(flag);
  }

  const builder = new Builder().forBrowser("chrome");
  return builder.setChromeOptions(chromeOptions).build();
}
/**
 * Looks up a child element without ever throwing.
 *
 * @param {object} element - Anything exposing `findElement(by)` (a driver or an element).
 * @param {*} by - Locator forwarded unchanged to `findElement`.
 * @returns {Promise<*>} The located element, or `null` when the lookup fails for any reason.
 */
async function safeFindElement(element, by) {
  let located = null;
  try {
    located = await element.findElement(by);
  } catch (lookupError) {
    // Any failure (missing element, bad receiver, ...) is reported as "not found".
    located = null;
  }
  return located;
}
/**
 * Logs in through the Facebook login form and stores the session cookies.
 *
 * Opens the login page, fills the `email` and `pass` inputs, clicks the `login`
 * control, waits five seconds and then inspects the current URL. If the URL
 * still contains "login" or "checkpoint" the attempt counts as failed.
 *
 * @param {object} driver - Selenium WebDriver instance.
 * @param {string} username - Value typed into the `email` input.
 * @param {string} password - Value typed into the `pass` input.
 * @param {string} cookieFilePath - Where the cookies are written after a successful login.
 * @returns {Promise<boolean>} `true` on success; `false` when any step inside the
 *   form-handling section fails (the error message is logged).
 */
async function loginFacebook(driver, username, password, cookieFilePath) {
  const locateTimeoutMs = 10000;

  // Waits until an element with the given `name` attribute is located.
  const waitForNamed = (fieldName) =>
    driver.wait(until.elementLocated(By.name(fieldName)), locateTimeoutMs);

  // Clears a named input and types the supplied text into it.
  const fillField = async (fieldName, text) => {
    const input = await waitForNamed(fieldName);
    await input.clear();
    await input.sendKeys(text);
  };

  // Navigation happens outside the try block, so a failure here rejects the promise.
  await driver.get("https://www.facebook.com/login");

  try {
    await fillField("email", username);
    await fillField("pass", password);

    const submitControl = await waitForNamed("login");
    await submitControl.click();
    await sleep(5000);

    const landedUrl = await driver.getCurrentUrl();
    const stillGated = ["login", "checkpoint"].some((marker) =>
      landedUrl.includes(marker)
    );
    if (stillGated) {
      throw new Error("Login failed!");
    }

    await saveCookies(driver, cookieFilePath);
    console.log("Auto login successful!");
    return true;
  } catch (loginError) {
    console.log("Auto login error:", loginError.message);
    return false;
  }
}
/**
 * Restores cookies from a JSON file into the browser session.
 *
 * When the file exists, it is parsed, facebook.com is opened, every cookie is
 * added individually (a cookie that cannot be added is logged and skipped),
 * and the page is refreshed.
 *
 * @param {object} driver - Selenium WebDriver instance.
 * @param {string} cookieFilePath - Path of the JSON cookie file.
 * @returns {Promise<boolean>} `true` when the file existed and the restore sequence
 *   ran to completion; `false` when the file is missing or any step outside the
 *   per-cookie handling failed (the error message is logged).
 */
async function loadCookies(driver, cookieFilePath) {
  try {
    if (!fs.existsSync(cookieFilePath)) {
      return false;
    }

    const rawJson = fs.readFileSync(cookieFilePath, "utf8");
    const storedCookies = JSON.parse(rawJson);

    // The page is opened before the cookies are added.
    await driver.get("https://www.facebook.com");
    await sleep(2000);

    for (const entry of storedCookies) {
      try {
        await driver.manage().addCookie(entry);
      } catch (addError) {
        console.log(`Could not add cookie: ${entry.name}`);
      }
    }

    console.log("Cookies loaded successfully!");
    await driver.navigate().refresh();
    await sleep(3000);
    return true;
  } catch (loadError) {
    console.log("Error loading cookies:", loadError.message);
    return false;
  }
}
/**
 * Writes the browser's current cookies to disk as pretty-printed JSON.
 *
 * @param {object} driver - Selenium WebDriver instance.
 * @param {string} cookieFilePath - Destination file (overwritten if present).
 * @returns {Promise<boolean>} `true` when the file was written, `false` when reading
 *   the cookies or writing the file failed (the error message is logged).
 */
async function saveCookies(driver, cookieFilePath) {
  let wasSaved = false;
  try {
    const cookieJar = await driver.manage().getCookies();
    const serialized = JSON.stringify(cookieJar, null, 2);
    fs.writeFileSync(cookieFilePath, serialized);
    console.log("Cookies saved successfully!");
    wasSaved = true;
  } catch (saveError) {
    console.log("Error saving cookies:", saveError.message);
  }
  return wasSaved;
}
/**
 * Finds the first post-like element on the current page and tries to expand
 * its truncated text.
 *
 * Candidate CSS selectors are tried in order; the first selector that matches
 * at least one element wins and its first match is used. The function then
 * clicks any control inside that element whose visible text is exactly
 * "See more" or "Xem thêm", repeating up to three times.
 *
 * @param {object} driver - Selenium WebDriver instance positioned on the target page.
 * @returns {Promise<{firstPost: object, usedSelector: string} | null>} The matched
 *   element and the selector that found it, or `null` (after logging
 *   "No post found!") when no selector matched.
 */
async function getFirstPostBySelector(driver) {
  const postSelectors = [
    '[data-pagelet*="FeedUnit"]',
    '[role="article"]',
    '[data-testid="fbfeed_story"]',
    ".userContentWrapper",
    "[data-ft]",
    'div[data-pagelet="ProfileTimeline"]',
    '[data-pagelet="ProfileTimeline"] > div > div',
    'div[data-ad-preview="message"]',
  ];
  const maxExpandRounds = 3;
  const expandSettleMs = 1200;

  // Runs in the page: clicks every matching "expand" control inside the post and
  // reports how many were clicked so the caller knows whether another round is useful.
  const expandScript = `
    var post = arguments[0];
    var clicked = 0;
    var candidates = post.querySelectorAll('div[role="button"], button, span');
    for (var j = 0; j < candidates.length; j++) {
      var label = candidates[j].innerText ? candidates[j].innerText.trim() : '';
      if (label === 'See more' || label === 'Xem thêm') {
        candidates[j].click();
        clicked++;
      }
    }
    return clicked;
  `;

  let firstPost = null;
  let usedSelector = "";
  for (const selector of postSelectors) {
    try {
      const posts = await driver.findElements(By.css(selector));
      if (posts.length > 0) {
        [firstPost] = posts;
        usedSelector = selector;
        break;
      }
    } catch (error) {
      // A selector that cannot be evaluated is skipped so the next one is tried.
    }
  }

  if (!firstPost) {
    console.log("No post found!");
    return null;
  }

  // Expansion is best-effort: a failure is reported but the post is still returned.
  try {
    for (let round = 0; round < maxExpandRounds; round++) {
      const clickedCount = await driver.executeScript(expandScript, firstPost);
      if (!clickedCount) {
        // Nothing left to expand, so there is nothing to wait for.
        break;
      }
      // Give the page time to render the expanded text before looking again.
      await sleep(expandSettleMs);
    }
  } catch (error) {
    console.log("Could not expand post text:", error.message);
  }

  return { firstPost, usedSelector };
}
// File that accumulates every saved post as a JSON array.
const POSTS_FILE_PATH = "facebook_first_post.json";
// Text stored in a record's `content` field when no text could be extracted.
const NO_CONTENT_PLACEHOLDER = "None content";
// Query parameters that identify a post in photo.php / permalink.php style links.
const POST_ID_PARAMS = ["story_fbid", "fbid", "id"];

/**
 * Builds the de-duplication key for a post link.
 *
 * Tracking parameters are dropped, but `story_fbid` / `fbid` (and the
 * accompanying `id`) are kept: for links such as `photo.php?fbid=…` the query
 * string is the only part that distinguishes one post from another.
 *
 * @param {*} postLink - Link as scraped or as read back from the posts file.
 * @returns {string} Normalized key, or "" when there is no usable link.
 */
function buildPostLinkKey(postLink) {
  if (typeof postLink !== "string" || postLink === "") {
    return "";
  }
  const queryStart = postLink.indexOf("?");
  if (queryStart === -1) {
    return postLink;
  }
  const base = postLink.slice(0, queryStart);
  const query = postLink.slice(queryStart + 1).split("#")[0];
  const params = new URLSearchParams(query);
  if (!params.get("story_fbid") && !params.get("fbid")) {
    return base;
  }
  const kept = new URLSearchParams();
  for (const name of POST_ID_PARAMS) {
    const value = params.get(name);
    if (value) {
      kept.append(name, value);
    }
  }
  return `${base}?${kept.toString()}`;
}

/**
 * Builds the de-duplication key for a post's text.
 *
 * A post without text is identified by its link alone. Only when it has no
 * link either does the placeholder act as its key, so such a post is saved
 * once instead of on every poll.
 *
 * @param {*} content - Extracted text, or the `content` field of a saved record.
 * @param {string} linkKey - Result of `buildPostLinkKey` for the same post.
 * @returns {string} Trimmed text, the placeholder, or "" when the text must not be used as a key.
 */
function buildContentKey(content, linkKey) {
  const text = typeof content === "string" ? content.trim() : "";
  if (text === "" || text === NO_CONTENT_PLACEHOLDER) {
    return linkKey ? "" : NO_CONTENT_PLACEHOLDER;
  }
  return text;
}

/**
 * Reads the saved posts from disk.
 *
 * @param {string} filePath - Path of the posts JSON file.
 * @returns {Array} Saved records; a file holding a single object yields a
 *   one-element array; a missing or unreadable file yields [] (the read error is logged).
 */
function readSavedPosts(filePath) {
  if (!fs.existsSync(filePath)) {
    return [];
  }
  try {
    const parsed = JSON.parse(fs.readFileSync(filePath, "utf8"));
    return Array.isArray(parsed) ? parsed : [parsed];
  } catch (error) {
    console.log(
      `Could not read ${filePath}, its contents will be ignored:`,
      error.message
    );
    return [];
  }
}

/**
 * Extracts the post text, trying the known content containers in order and
 * falling back to the text of the whole post element.
 *
 * @param {object} post - Post WebElement.
 * @returns {Promise<string>} Extracted text; may be empty.
 */
async function extractPostContent(post) {
  const contentSelectors = [
    'div[data-ad-preview="message"]',
    ".userContent",
    'div[role="article"]',
    'div[data-testid="post_message"]',
  ];
  for (const selector of contentSelectors) {
    const container = await safeFindElement(post, By.css(selector));
    if (container) {
      const text = await container.getText();
      if (text) {
        return text;
      }
    }
  }
  return post.getText();
}

/**
 * Reads the text of the post's permalink anchor, which is stored as the post's `time`.
 *
 * @param {object} post - Post WebElement.
 * @returns {Promise<string>} Anchor text, or "" when it is missing or unreadable.
 */
async function extractPostTime(post) {
  try {
    const anchor = await safeFindElement(
      post,
      By.css('a[aria-label][href*="/posts/"]')
    );
    return anchor ? await anchor.getText() : "";
  } catch (error) {
    return "";
  }
}

/**
 * Returns the first anchor href inside the post that looks like a post link.
 *
 * @param {object} post - Post WebElement.
 * @returns {Promise<string>} The href, or "" when none is found or the lookup fails.
 */
async function extractPostLink(post) {
  try {
    const anchors = await post.findElements(By.css("a"));
    for (const anchor of anchors) {
      const href = await anchor.getAttribute("href");
      if (
        href &&
        (href.includes("/posts/") ||
          href.includes("/permalink/") ||
          href.includes("fbid="))
      ) {
        return href;
      }
    }
  } catch (error) {
    console.log("Could not read post link:", error.message);
  }
  return "";
}

/**
 * Collects the http(s) image sources inside the post, skipping inline data
 * URIs and sources containing "static.xx".
 *
 * @param {object} post - Post WebElement.
 * @returns {Promise<string[]>} Sources gathered before any failure occurred.
 */
async function extractPostImages(post) {
  const images = [];
  try {
    const imageElements = await post.findElements(By.css("img"));
    for (const img of imageElements) {
      const src = await img.getAttribute("src");
      if (
        src &&
        src.startsWith("http") &&
        !src.includes("data:image") &&
        !src.includes("static.xx")
      ) {
        images.push(src);
      }
    }
  } catch (error) {
    console.log("Could not read post images:", error.message);
  }
  return images;
}

/**
 * Entry point: logs in (via stored cookies when possible, otherwise with
 * credentials), then polls the target page forever and appends the first post
 * to the posts file whenever it has not been saved before.
 *
 * Inputs: `process.argv[2]` (target URL), `process.argv[4]` / `FB_USERNAME`,
 * `process.argv[5]` / `FB_PASSWORD`.
 * The process exit code is set to 1 when credentials are missing, when login
 * fails, or when an unexpected error stops the loop.
 */
(async function main() {
  const cookieFilePath = "facebook_cookies.json";
  const targetUrl =
    process.argv[2] || "https://www.facebook.com/logisticsarena.bacib.tdtu";
  // NOTE(review): argv[3] is not read anywhere in this block; credentials are taken
  // from argv[4]/argv[5] — confirm the intended argument layout.
  const username = process.argv[4] || process.env.FB_USERNAME || "";
  const password = process.argv[5] || process.env.FB_PASSWORD || "";
  // Pause between two polls of the target page (a 5-minute interval was noted as an alternative).
  const timeRetry = 20 * 1000;
  // NOTE(review): pause before the browser is closed; presumably leaves the last
  // page visible for inspection — confirm it is still wanted.
  const quitDelayMs = 10000;

  // Validate before launching the browser so nothing has to be cleaned up on this path.
  if (!username || !password) {
    console.log(
      "Missing username or password! Pass via argv or set FB_USERNAME, FB_PASSWORD environment variables."
    );
    process.exitCode = 1;
    return;
  }

  // Seed the duplicate filters from what is already on disk.
  const postSet = new Set();
  const contentSet = new Set();
  for (const saved of readSavedPosts(POSTS_FILE_PATH)) {
    if (!saved || typeof saved !== "object") {
      continue;
    }
    const savedLinkKey = buildPostLinkKey(saved.postLink);
    const savedContentKey = buildContentKey(saved.content, savedLinkKey);
    if (savedLinkKey) postSet.add(savedLinkKey);
    if (savedContentKey) contentSet.add(savedContentKey);
  }

  let driver = null;
  try {
    driver = await initializeDriver();

    const cookiesLoaded = await loadCookies(driver, cookieFilePath);
    let isLoggedIn = false;
    if (cookiesLoaded) {
      try {
        const currentUrl = await driver.getCurrentUrl();
        isLoggedIn =
          !currentUrl.includes("login") && !currentUrl.includes("checkpoint");
      } catch (error) {
        console.log("Could not verify the restored session:", error.message);
      }
    }

    if (!isLoggedIn) {
      const loginSuccess = await loginFacebook(
        driver,
        username,
        password,
        cookieFilePath
      );
      if (!loginSuccess) {
        // Returning (instead of process.exit) lets the finally block close the browser.
        process.exitCode = 1;
        return;
      }
    }

    while (true) {
      await driver.get(targetUrl);
      await sleep(5000);

      // getFirstPostBySelector logs "No post found!" itself when it returns null.
      const postResult = await getFirstPostBySelector(driver);
      if (postResult) {
        const { firstPost, usedSelector } = postResult;
        const rawContent = await extractPostContent(firstPost);
        const time = await extractPostTime(firstPost);
        const postLink = await extractPostLink(firstPost);

        const postLinkKey = buildPostLinkKey(postLink);
        const contentKey = buildContentKey(rawContent, postLinkKey);
        const isDuplicate =
          (postLinkKey !== "" && postSet.has(postLinkKey)) ||
          (contentKey !== "" && contentSet.has(contentKey));

        if (isDuplicate) {
          console.log("Post already fetched or content duplicated, skipping!");
        } else {
          // Images are only scraped for posts that will actually be saved.
          const images = await extractPostImages(firstPost);
          const newPost = {
            url: targetUrl,
            selector_used: usedSelector,
            content: rawContent || NO_CONTENT_PLACEHOLDER,
            postLink: postLink,
            images: images,
            timestamp: new Date().toISOString(),
            time: time,
          };
          const postsArr = readSavedPosts(POSTS_FILE_PATH);
          postsArr.push(newPost);
          fs.writeFileSync(POSTS_FILE_PATH, JSON.stringify(postsArr, null, 2));
          if (postLinkKey) postSet.add(postLinkKey);
          if (contentKey) contentSet.add(contentKey);
          console.log("Saved latest post to facebook_first_post.json");
        }
      }

      await sleep(timeRetry);
    }
  } catch (error) {
    console.error("Error:", error.message);
    process.exitCode = 1;
  } finally {
    if (driver) {
      await sleep(quitDelayMs);
      try {
        await driver.quit();
        console.log("Browser closed");
      } catch (quitError) {
        console.error("Error closing browser:", quitError.message);
      }
    }
  }
})();