I am writing a script to scrape Amazon products, and I am using proxy rotation. The problem I am facing is that when Playwright launches the browser and opens the URL, Amazon uses the location of the proxy. I want to use the same location every time, because otherwise the products and offers vary from one location to another.
I wanted to ask if anyone knows of a parameter to add to the URL or any way to make Playwright always open Amazon with my location configuration.
I tried automating it so that every time Playwright opens the page, it clicks through to change the location, but the problem is that with proxies the structure and styling of Amazon vary according to the region it loads, and sometimes it doesn’t even allow changing the location.
I also adjusted the cookies and headers, but that didn’t work:
import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';
import randomUseragent from 'random-useragent';
import fs from 'fs/promises';
import path from 'path';
import axios from 'axios';
import { router_selections } from './routerSelectors.js';
// Build the stealth plugin, switch off the evasions this script does not want
// (a user agent is supplied explicitly at launch time), then register the
// plugin on playwright-extra's chromium launcher.
const stealthPlugin = stealth();
for (const evasionName of ['user-agent-override', 'webdriver']) {
  stealthPlugin.enabledEvasions.delete(evasionName);
}
chromium.use(stealthPlugin);
...
// Cookies seeded into the browser context before the first navigation.
// Every entry is scoped to `.amazon.es` with path `/`; only name and value
// differ, so they are listed as [name, value] pairs and expanded below.
// The '...' values are placeholders as they appear in this file.
// NOTE(review): the `-main` suffixed names (ubid-main, lc-main) look like the
// amazon.com variants — confirm amazon.es does not expect its own suffix.
const COOKIES = [
  ['i18n-prefs', 'EUR'],
  ['ubid-main', '...'],
  ['lc-main', 'es_ES'],
  ['session-id', '...'],
  ['session-id-apay', '...'],
  ['session-id-time', '...'],
  ['sp-cdn', '...'],
  ['session-token', '...'],
  ['skin', 'noskin'],
  ['csm-hit', '...'],
].map(([name, value]) => ({ name, value, domain: '.amazon.es', path: '/' }));
...
/**
 * Opens `url` in a persistent Chromium context routed through a proxy from
 * getProxy(), seeds COOKIES, extracts product cards (`div[data-asin]`) and
 * logs them. On any error the page and browser are closed and the whole
 * launch is retried, up to `maxRetries` failures.
 *
 * Part of the body is elided in this file (`...` near the end), so the return
 * value and any final cleanup are not documented here.
 *
 * @param {string} url - Page to open; passed straight to page.goto().
 * @param {number} [maxRetries=3] - Number of failed attempts after which the loop stops.
 */
async function scrapAmazon(url, maxRetries = 3) {
// `browser` receives the result of launchPersistentContext and is then used
// with addCookies()/newPage(); `context` is never assigned in the visible code.
let browser, context, page;
let retryCount = 0;
let proxy, userAgent;
// These two flags are always flipped together below, so the launch branch
// runs on the first pass and again after every failure.
let page_reload = false;
let first_time = true;
// Profile directory for the persistent context: <cwd>/user_data, created if missing.
const userDataDir = path.join(process.cwd(), 'user_data');
await fs.mkdir(userDataDir, { recursive: true });
// retryCount only grows in the catch block, so in the visible code a
// successful pass does not end the loop: it waits, reloads and extracts again.
// NOTE(review): confirm this endless re-scrape on success is intended.
while (retryCount < maxRetries) {
try {
if (!page_reload && first_time) {
// Fresh proxy and random user agent for each (re)launch.
proxy = await getProxy();
userAgent = randomUseragent.getRandom();
browser = await chromium.launchPersistentContext(userDataDir, {
headless: false,
proxy: { server: `${proxy}` },
// Viewport is randomised to 1920-2019 x 1080-1179.
viewport: {
width: 1920 + Math.floor(Math.random() * 100),
height: 1080 + Math.floor(Math.random() * 100),
},
userAgent: userAgent,
// Spanish geolocation/locale/timezone reported by the browser.
// NOTE(review): these options do not change the IP the site sees via the
// proxy — presumably why the storefront still follows the proxy region.
geolocation: { longitude: -3.70379, latitude: 40.4165 },
locale: 'es-ES',
timezoneId: 'Europe/Madrid',
permissions: ['geolocation'],
// NOTE(review): Playwright documents extraHTTPHeaders as sent with every
// request, so the navigation-style Sec-Fetch-* values below would also go
// out on subresource requests — confirm that is acceptable.
extraHTTPHeaders: {
'Accept-Language': 'es-ES,es;q=0.9,en;q=0.8',
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Upgrade-Insecure-Requests': '1',
Connection: 'keep-alive',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
},
// NOTE(review): the last flag is spelled 'certifcate' (missing 'i'); the
// Chromium switch is '--ignore-certificate-errors-spki-list' — confirm/fix.
args: ['--disable-session-crashed-bubble', '--disable-web-security', '--allow-running-insecure-content', '--ignore-certificate-errors', '--disable-gpu', '--ignore-certifcate-errors-spki-list'],
ignoreHTTPSErrors: true,
});
// Cookies are added one at a time so a single rejected cookie is logged
// and skipped instead of aborting the launch.
for (const cookie of COOKIES) {
try {
await browser.addCookies([cookie]);
} catch (error) {
console.log('Error adding cookie:', cookie.name);
console.error('Error adding cookie:', error.message);
}
}
page = await browser.newPage();
await page.goto(url, { waitUntil: 'load', timeout: 60000 });
// 1 ms pause before handing the page to router_selections.
await page.waitForTimeout(1);
// router_selections failures are logged and do not trigger a retry.
try {
await router_selections(page);
} catch (error) {
console.log(`Error in router o captcha selections: ${error.message}`);
}
}
// Fixed 10 s wait before reading the DOM.
await page.waitForTimeout(10000);
// One record per element with a non-empty data-asin; missing fields fall
// back to 'No title' / 'No price' / 'No rating' / 'No reviews'.
// NOTE(review): the reviews selector matches aria-labels ending in "stars"
// (English); with locale es-ES the label is presumably Spanish — verify.
const products = await page.$$eval('div[data-asin]:not([data-asin=""])', (elements) => {
return elements.map((el) => ({
asin: el.getAttribute('data-asin'),
title: el.querySelector('h2 span')?.textContent.trim() || 'No title',
price: el.querySelector('.a-price-whole')?.textContent.trim() || 'No price',
rating: el.querySelector('.a-icon-star-small')?.textContent.trim() || 'No rating',
reviews: el.querySelector('span[aria-label$="stars"]')?.nextElementSibling?.textContent.trim().replace(/[(),]/g, '') || 'No reviews',
}));
});
products.forEach((product) => {
console.log('Product: ', product);
});
// 900000 ms = 15 minutes; the success message below is only logged after this wait.
await page.waitForTimeout(900000);
console.log(`Scraped ${products.length} products successfully.`);
// Reload the same page and mark the session as established so the next
// pass skips the launch branch and extracts from the reloaded page.
await page.reload({ waitUntil: 'networkidle', timeout: 60000 });
first_time = false;
page_reload = true;
} catch (error) {
// Any failure: log, count it, close page and browser (close errors are
// ignored), reset the flags to force a fresh launch, then wait 3 s.
console.log(error.message);
retryCount++;
if (page) await page.close().catch(() => {});
if (browser) await browser.close().catch(() => {});
first_time = true;
page_reload = false;
await new Promise((resolve) => setTimeout(resolve, 3000));
}
}
...
}
1