ARTICLE AD BOX
Hi Stack Overflow community,
I’ve been using Obsidian for a long time to organize my stories, notes, projects, and collections—books, manga, movies, code, basically anything I want to catalog. I love collecting and learning along the way, and Obsidian works great because it’s fast, low on storage, and supports plugins, links, and images.
However, I’m struggling with automatically importing large amounts of data into my vault as Markdown files. Specifically, I want to import books from Goodreads lists (for example: The Most Disturbing Books Ever Written), including:
Title
Author
Cover image (coverUrl)
Average rating
Genre
I also want to avoid creating duplicate notes when a book already exists in the vault.
I’ve tried using requests + BeautifulSoup, but Goodreads now loads the list content dynamically with JavaScript, so my script doesn’t find any books. I switched to Selenium, which works for scraping the page, but I’m wondering if there’s a more efficient, Pythonic, or scalable way to do this—especially if I want to import multiple lists, hundreds of books, or other types of media (movies, manga, etc.) later.
I would greatly appreciate any guidance, examples, or libraries that could help me:
Scrape Goodreads lists reliably, even with JS-loaded content
Export each item to Markdown with frontmatter for Obsidian
Handle large collections efficiently (avoid duplicates, checkpoint progress)
Current setup in Obsidian (example DataviewJS code for panels and filtered notes):
dataviewjs
/* === CONFIGURATION === */
// Vault folders scanned by the gallery. Each path is wrapped in double quotes
// and the quoted paths are OR-joined into the source string given to dv.pages().
const folders = [
  "3- Bem estar/Hobbies e Inspirações/Coleções",
  "2- Maestria/Conhecimento",
  "1- Acadêmico",
].map((folderPath) => `"${folderPath}"`);
const query = folders.join(" or ");

// Accent colour (hex) for every canonical category label; "Outros" is the fallback.
const cores = {
  // Reading
  "Livros": "#1D4ED8",
  "Mangás": "#F43F5E",
  "HQs": "#F97316",
  "Webtoons": "#06B6D4",
  "Manhwa": "#8B5CF6",
  // Screen and audio
  "Filmes": "#DC2626",
  "Séries": "#2563EB",
  "Músicas": "#EAB308",
  // Art
  "Arte": "#10B981",
  "Esculturas": "#78716C",
  "Obras de Arte": "#A855F7",
  // Beings
  "Animals": "#84CC16",
  "Creatures": "#9333EA",
  // Games
  "Jogos de Tabuleiro": "#B45309",
  "Video Games": "#0EA5E9",
  // Reference
  "Wiki": "#15803D",
  "Jogos": "#3B82F6",
  "Documentos": "#64748B",
  "Outros": "#9CA3AF",
  // Knowledge areas
  "Área Acadêmica": "#0D9488",
  "Área de Conhecimento": "#1E40AF",
  "Área Profissional": "#7C3AED",
  // Notes
  "Anotação": "#92487A",
  "Reflexão": "#6B7280",
  "Citação": "#9CA3AF",
  "Experimento Científico": "#16A34A",
};

// Pagination state shared by the render / filter / navigation code.
const PAGE_SIZE = 21; // cards per page
let currentPage = 1; // 1-based index of the page being shown
let filteredItems = []; // items that passed the active filters, in display order
/* === BASE FUNCTIONS === */
/**
 * Coerces a value to an array.
 *
 * @param {*} value - An array, a single value, or a falsy value.
 * @returns {Array} `value` itself when it is already an array, `[]` for any
 *   falsy value, otherwise a one-element array wrapping `value`.
 */
function normalizeList(value) {
  if (Array.isArray(value)) return value;
  return value ? [value] : [];
}
/**
 * Formats a single value or a list of values as one comma-separated string.
 *
 * @param {*} value - Anything accepted by normalizeList().
 * @returns {string} The entries joined with ", " (empty string when there are none).
 */
function joinList(value) {
  const entries = normalizeList(value);
  return entries.join(", ");
}
/**
 * Maps a free-form `type` value to one of the canonical category labels
 * (the keys of the colour table), falling back to "Outros".
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Besides the
 * listed aliases, every canonical label is accepted as its own alias, so a
 * note that already says `type: Filmes` is classified as "Filmes" rather than
 * "Outros".
 *
 * @param {*} typeRaw - Raw type value; coerced with String().
 * @returns {string} The canonical label, or "Outros" when nothing matches.
 */
function normalizarTipo(typeRaw) {
  if (!typeRaw) return "Outros";
  const t = String(typeRaw).toLowerCase().trim();

  // canonical label -> accepted aliases (all lowercase)
  const aliases = {
    "Livros": ["livro", "livros", "book", "books"],
    "Mangás": ["manga", "mangá", "mangas", "mangás"],
    "HQs": ["hq", "comic", "comics"],
    "Webtoons": ["webtoon", "webtoons"],
    "Manhwa": ["manhwa", "manhwas"],
    "Filmes": ["filme", "movie", "movies"],
    "Séries": ["série", "series"],
    "Músicas": ["musicrelease", "música", "musica", "music"],
    "Arte": ["arte", "art"],
    "Esculturas": ["escultura", "sculpture"],
    "Obras de Arte": ["obra de arte", "artwork"],
    "Animals": ["ser", "animal", "animals"],
    "Creatures": ["criatura", "criaturas", "creature", "creatures", "entity"],
    "Jogos": ["jogo", "game", "games"],
    "Jogos de Tabuleiro": ["jogo de tabuleiro", "board game"],
    "Video Games": ["videogame", "video game"],
    "Wiki": ["wiki", "wikipedia"],
    "Documentos": ["documento", "document"],
    "Área Acadêmica": ["acadêmico", "academico", "área acadêmica", "area academica"],
    "Área de Conhecimento": ["conhecimento", "área de conhecimento"],
    "Área Profissional": ["profissional", "área profissional"],
    "Anotação": ["anotação", "note"],
    "Reflexão": ["reflexão"],
    "Citação": ["citação", "quote"],
    "Experimento Científico": ["experimento científico", "experimento"],
  };

  // Iterating own entries (instead of indexing a plain object with user text)
  // keeps inherited members such as "constructor" from being returned.
  for (const [canonical, names] of Object.entries(aliases)) {
    if (canonical.toLowerCase() === t || names.includes(t)) return canonical;
  }
  return "Outros";
}
/**
 * Capitalises the first character of a subtype label.
 *
 * Frontmatter values are not guaranteed to be strings, so non-string input is
 * converted first (lists are joined with ", ") instead of throwing on
 * `.charAt`.
 *
 * @param {*} subTypeRaw - Raw subtype value.
 * @returns {string} The label with its first character upper-cased, or ""
 *   for a falsy input.
 */
function normalizarSubTipo(subTypeRaw) {
  if (!subTypeRaw) return "";
  const text = Array.isArray(subTypeRaw) ? subTypeRaw.join(", ") : String(subTypeRaw);
  return text.charAt(0).toUpperCase() + text.slice(1);
}
/**
 * Converts a 0–10 score into a five-symbol star string.
 *
 * The score is clamped to [0, 10] and halved; each whole unit becomes "★",
 * a remainder of at least 0.5 becomes one "⯪", and the rest is padded with "☆".
 *
 * @param {*} nota - Score; coerced with Number().
 * @returns {string} The star string, or "" when `nota` is null/undefined.
 */
function renderRating(nota) {
  if (nota == null) return "";

  const outOfFive = Math.min(10, Math.max(0, Number(nota))) / 2;
  const whole = Math.floor(outOfFive);
  const hasHalf = outOfFive - whole >= 0.5;
  const blanks = 5 - whole - (hasHalf ? 1 : 0);

  return ["★".repeat(whole), hasHalf ? "⯪" : "", "☆".repeat(blanks)].join("");
}
/* === LOADING === */
// Build one gallery record per page returned by dv.pages(query). Each record
// keeps every original page field and overrides the ones the gallery reads
// with normalised values.
let items = [];
for (const page of dv.pages(query)) {
  const type = normalizarTipo(page.type ?? "");
  const subType = normalizarSubTipo(page.subType ?? "");

  // Authors/writers, directors and artists merged into one display string.
  const people = [
    page.author ?? page.autor ?? page.writer,
    page.director ?? page.diretor,
    page.artist ?? page.artista,
  ]
    .flatMap((field) => normalizeList(field))
    .filter(Boolean);
  const creators = joinList(people);

  const entry = {
    ...page,
    cover: page.cover ?? page.coverUrl ?? page.poster ?? "",
    // First available title field, falling back to the file name.
    title: page.title ?? page.portugueseTitle ?? page.englishTitle ?? page.file.name,
    portugueseTitle: page.portugueseTitle ?? "",
    englishTitle: page.englishTitle ?? "",
    onlineRating: page.onlineRating ?? null,
    creators,
    type,
    subType,
    status: String(page.status ?? "").trim(),
    series: page.series ?? "",
    rating: page.rating ?? null,
    created: page.file.ctime,
  };
  items.push(entry);
}
/* === UI CONTROLS === */
// Replace the contents of dv.container with the static gallery skeleton:
// a row of filter/sort <select> controls plus a reset button, an empty card
// grid (#dv-gallery-grid), and the pagination bar.
// The type, subtype and status selects start with only their catch-all
// option; populateFilters() appends the remaining options.
// The element ids declared here are the ones the filter, render and event
// code look up with document.getElementById().
dv.container.innerHTML = `
<div class="dv-gallery-controls">
<select id="dv-g-type"><option value="">Todos os tipos</option></select>
<select id="dv-g-subtype"><option value="">Todos os subtipos</option></select>
<select id="dv-g-status"><option value="">Todos os status</option></select>
<select id="dv-g-image">
<option value="">Todas</option>
<option value="with">Com Imagem</option>
<option value="without">Sem Imagem</option>
</select>
<select id="dv-g-order">
<option value="title">Título</option>
<option value="created">Data de criação</option>
<option value="fileName">Nome do arquivo</option>
</select>
<select id="dv-g-direction">
<option value="asc">Crescente ↑</option>
<option value="desc">Decrescente ↓</option>
</select>
<button id="dv-g-reset">Resetar</button>
</div>
<div id="dv-gallery-grid" class="dv-grid"></div>
<div class="dv-gallery-pages">
<button id="dv-g-prev">⬅️ Página anterior</button>
<span id="dv-g-page">1</span>
<button id="dv-g-next">Próxima página ➡️</button>
<button id="dv-g-top">🔝 Topo</button>
</div>
`;
/* === POPULATE FILTERS === */
/**
 * Appends one <option> per distinct type, subtype and status found in `items`
 * to the matching filter <select>.
 *
 * Options are created through the DOM (value/textContent) rather than by
 * concatenating HTML, so a value containing `"`, `<` or `&` can neither break
 * the markup nor inject elements. Empty subtypes/statuses are skipped; a
 * select that is not in the document is left alone.
 */
function populateFilters() {
  const fill = (selectId, values) => {
    const select = document.getElementById(selectId);
    if (!select) return;
    for (const value of new Set(values)) {
      const option = document.createElement("option");
      option.value = value;
      option.textContent = value;
      select.appendChild(option);
    }
  };

  fill("dv-g-type", items.map((i) => i.type));
  fill("dv-g-subtype", items.map((i) => i.subType).filter(Boolean));
  fill("dv-g-status", items.map((i) => i.status).filter(Boolean));
}
/* === RENDER GRID === */
/**
 * Renders the current page of `list` as cards inside #dv-gallery-grid and
 * updates the page indicator (#dv-g-page).
 *
 * Only the slice selected by `currentPage` and `PAGE_SIZE` is drawn. All text
 * taken from the notes (title, creators, subtype, status, cover URL, online
 * rating) is HTML-escaped before being placed into the card markup, so
 * characters such as `<`, `&` or `"` in a title or URL are shown literally
 * instead of being parsed as HTML.
 *
 * @param {Array<object>} list - Items to paginate and display.
 */
function render(list) {
  const htmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
  const escapeHtml = (value) =>
    String(value ?? "").replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

  const grid = document.getElementById("dv-gallery-grid");
  grid.innerHTML = "";

  const start = (currentPage - 1) * PAGE_SIZE;
  const pageItems = list.slice(start, start + PAGE_SIZE);

  pageItems.forEach((p) => {
    const typeColor = cores[p.type] ?? cores["Outros"];
    const cover = p.cover || "https://placehold.co/400x600?text=Sem+Capa";
    const rating = renderRating(p.rating);

    // Cards with a subtype get a translucent tint of the type colour (#RRGGBB -> rgba).
    const cardBg = p.subType
      ? `rgba(${parseInt(typeColor.slice(1, 3), 16)},${parseInt(typeColor.slice(3, 5), 16)},${parseInt(typeColor.slice(5, 7), 16)},0.15)`
      : "var(--background-secondary)";

    // Assigned through the `title` property, so it needs no escaping.
    const tooltip = [
      `📌 Original: ${p.title}`,
      `🇧🇷 PT: ${p.portugueseTitle || "—"}`,
      `🇺🇸 EN: ${p.englishTitle || "—"}`,
      `Tipo: ${p.type}`,
      `Subtipo: ${p.subType || "—"}`,
      `Status: ${p.status || "—"}`,
    ].join("\n").trim();

    const subTypeBadge = p.subType
      ? `<span class="dv-card-subtype" style="border:1px solid ${typeColor}; color:${typeColor}; background:rgba(255,255,255,0.9);">${escapeHtml(p.subType)}</span>`
      : "";
    const onlineRating = p.onlineRating
      ? `<div style="font-size:0.7rem; opacity:0.8;">🌐 ${escapeHtml(p.onlineRating)}/10</div>`
      : "";
    const statusLine = p.status
      ? `<div class="dv-card-status">${escapeHtml(p.status)}</div>`
      : "";

    const card = document.createElement("div");
    card.className = "dv-card";
    card.style.border = `2px solid ${typeColor}`;
    card.style.background = cardBg;
    card.title = tooltip;
    card.innerHTML = `
<div class="dv-card-media">
<img src="${escapeHtml(cover)}">
<div class="dv-badges">
<span class="dv-card-badge" style="background:${typeColor};">${escapeHtml(p.type)}</span>
${subTypeBadge}
</div>
</div>
<div class="dv-card-body">
<div class="dv-card-title">${escapeHtml(p.title)}</div>
<div class="dv-card-sub">${escapeHtml(p.creators)}</div>
<div class="dv-card-rating" style="color:${typeColor};">
${rating}${onlineRating}
</div>
${statusLine}
</div>
`;
    // Clicking a card opens the underlying note.
    card.onclick = () => app.workspace.openLinkText(p.file.path, '/', false);
    grid.appendChild(card);
  });

  document.getElementById("dv-g-page").textContent = currentPage;
}
/* === FILTERS AND NAVIGATION === */
/**
 * Reads the filter/sort controls, rebuilds `filteredItems` from `items`,
 * resets pagination to page 1 and re-renders the grid.
 *
 * Filters: type, subtype and status must match exactly when selected; the
 * image filter keeps only items with ("with") or without ("without") a cover.
 *
 * Sorting: by creation time, file name or title, ascending or descending.
 * When both keys are strings they are compared with localeCompare
 * (case/accent-insensitive, numeric-aware) so that "Água" sorts next to
 * "apple" and "Volume 2" before "Volume 10"; any other key types fall back
 * to the `<` / `>` operators.
 */
function applyFilters() {
  const valueOf = (id) => document.getElementById(id).value;
  const typ = valueOf("dv-g-type");
  const subtyp = valueOf("dv-g-subtype");
  const stat = valueOf("dv-g-status");
  const imgFilter = valueOf("dv-g-image");
  const order = valueOf("dv-g-order");
  const dir = valueOf("dv-g-direction");

  filteredItems = items.filter((p) =>
    (!typ || p.type === typ) &&
    (!subtyp || p.subType === subtyp) &&
    (!stat || p.status === stat) &&
    (!imgFilter || (imgFilter === "with" && p.cover) || (imgFilter === "without" && !p.cover))
  );

  let keyOf;
  if (order === "created") keyOf = (item) => item.created;
  else if (order === "fileName") keyOf = (item) => item.file.name;
  else keyOf = (item) => item.title;

  const sign = dir === "asc" ? 1 : -1;
  filteredItems.sort((a, b) => {
    const A = keyOf(a);
    const B = keyOf(b);
    if (typeof A === "string" && typeof B === "string") {
      return sign * A.localeCompare(B, undefined, { numeric: true, sensitivity: "base" });
    }
    if (A < B) return -sign;
    if (A > B) return sign;
    return 0;
  });

  currentPage = 1;
  render(filteredItems);
}
/* === EVENTS === */
// Wire up the controls 100 ms after the script runs: fill the filter
// dropdowns, attach the change/click handlers, then draw the first page.
setTimeout(() => {
  const byId = (id) => document.getElementById(id);
  const scrollToTop = () => {
    dv.container.scrollTop = 0;
  };
  // Switch to `page`, redraw and jump back to the top of the container.
  const showPage = (page) => {
    currentPage = page;
    render(filteredItems);
    scrollToTop();
  };
  const filterIds = ["dv-g-type", "dv-g-subtype", "dv-g-status", "dv-g-image"];

  populateFilters();

  // Any change to a filter or sort control re-applies everything.
  for (const id of [...filterIds, "dv-g-order", "dv-g-direction"]) {
    byId(id)?.addEventListener("change", applyFilters);
  }

  // Reset: clear every filter and restore the default sort (title, ascending).
  byId("dv-g-reset")?.addEventListener("click", () => {
    for (const id of filterIds) {
      byId(id).value = "";
    }
    byId("dv-g-order").value = "title";
    byId("dv-g-direction").value = "asc";
    applyFilters();
  });

  byId("dv-g-prev")?.addEventListener("click", () => {
    if (currentPage > 1) showPage(currentPage - 1);
  });

  byId("dv-g-next")?.addEventListener("click", () => {
    // Only advance while there are items beyond the current page.
    if (currentPage * PAGE_SIZE < filteredItems.length) showPage(currentPage + 1);
  });

  byId("dv-g-top")?.addEventListener("click", scrollToTop);

  // Initial view: every item, unfiltered, in load order.
  filteredItems = items;
  render(filteredItems);
}, 100);
/* === INJECTED RESPONSIVE CSS === */
// Stylesheet for the gallery: controls bar, responsive card grid, card parts
// and pagination bar, with narrower grid columns under 768px and 480px.
const css = `
.dv-gallery-controls {display:flex; gap:8px; flex-wrap:wrap; align-items:center; margin-bottom:1em;}
.dv-grid {display:grid; gap:12px; grid-template-columns: repeat(auto-fill, minmax(140px, 1fr));}
.dv-card {border-radius:10px; overflow:hidden; cursor:pointer; transition: transform 0.2s, box-shadow 0.2s; display:flex; flex-direction:column; background: var(--background-secondary);}
.dv-card:hover {transform: translateY(-4px); box-shadow:0 6px 16px rgba(0,0,0,0.2);}
.dv-card-media {position:relative; width:100%; height:0; padding-bottom:150%;}
.dv-card-media img {position:absolute; top:0; left:0; width:100%; height:100%; object-fit:cover; border-bottom:1px solid rgba(0,0,0,0.1);}
.dv-badges {position:absolute; top:6px; left:6px; display:flex; flex-direction:column; gap:4px;}
.dv-card-badge, .dv-card-subtype {padding:3px 7px; font-size:0.72rem; border-radius:4px; font-weight:600;}
.dv-card-body {padding:8px 6px; text-align:center; display:flex; flex-direction:column; gap:4px;}
.dv-card-title {font-weight:600; font-size:0.9rem; line-height:1.1rem; word-break:break-word;}
.dv-card-sub {font-size:0.75rem; opacity:0.7;}
.dv-card-rating {font-size:0.8rem;}
.dv-card-status {font-size:0.75rem; opacity:0.7;}
.dv-gallery-pages {display:flex; gap:8px; align-items:center; margin-top:1em;}
@media (max-width:768px) {.dv-grid {grid-template-columns: repeat(auto-fill, minmax(100px, 1fr));}}
@media (max-width:480px) {.dv-grid {grid-template-columns: repeat(auto-fill, minmax(80px, 1fr));} .dv-card-title {font-size:0.8rem;} .dv-card-sub {font-size:0.65rem;} .dv-card-rating {font-size:0.7rem;}}
`;
// Inject the stylesheet once per document, reusing the <style> element on
// later runs. Its content is always rewritten, so edits to the CSS above take
// effect on the next run instead of being ignored while the old <style>
// element is still present.
{
  let styleEl = document.getElementById("dv-gallery-style");
  if (!styleEl) {
    styleEl = document.createElement("style");
    styleEl.id = "dv-gallery-style";
    document.head.appendChild(styleEl);
  }
  styleEl.textContent = css;
}
Current Python import code:
# -*- coding: utf-8 -*-
"""Import the books of a Goodreads list into an Obsidian vault.

Each book becomes one Markdown note with YAML frontmatter. Books already
recorded in the checkpoint file, or whose note file already exists, are skipped.
"""
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
import yaml
from bs4 import BeautifulSoup

OUTPUT_DIR = r"C:\Users\Usuario\Documents\Gnosis\3- Bem estar\Hobbies e Inspirações\Coleções\Leituras\Livros"
os.makedirs(OUTPUT_DIR, exist_ok=True)
CHECKPOINT_FILE = os.path.join(OUTPUT_DIR, "checkpoint.json")
MAX_WORKERS = 5
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks forever

# Characters that are not allowed in Windows file names.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def safe_filename(s):
    """Return `s` with characters that are invalid in file names replaced by "-"."""
    return _INVALID_FILENAME_CHARS.sub("-", s).strip()


def write_md(filename, yaml_obj, body_md=""):
    """Write a note with YAML frontmatter to OUTPUT_DIR unless it already exists.

    Args:
        filename: File name (not a path) of the note.
        yaml_obj: Mapping dumped as the frontmatter.
        body_md: Markdown placed after the frontmatter.

    Returns:
        True if the file was created, False if it already existed.
    """
    path = os.path.join(OUTPUT_DIR, filename)
    try:
        # Mode "x" creates the file atomically, so two worker threads handling
        # the same title cannot both pass an exists-check and overwrite each other.
        f = open(path, "x", encoding="utf-8")
    except FileExistsError:
        return False
    with f:
        f.write("---\n")
        f.write(yaml.safe_dump(yaml_obj, sort_keys=False, allow_unicode=True))
        f.write("---\n\n")
        f.write(body_md)
    return True


def scrape_list_page(url):
    """Download one list page and return its books as a list of dicts.

    Each dict has the keys title, author, average_rating, image_url and genre.
    Rows without a title or an author are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.content, "html.parser")

    books = []
    # NOTE(review): confirm the "div.elementList" row selector against the
    # current Goodreads list markup; if it matches nothing, no books are found.
    for book_row in soup.select("div.elementList"):
        title_tag = book_row.select_one("a.bookTitle span")
        author_tag = book_row.select_one("a.authorName span")
        rating_tag = book_row.select_one("span.minirating")
        cover_tag = book_row.select_one("img.bookCover")
        if not title_tag or not author_tag:
            continue

        avg_rating = None
        if rating_tag:
            # Keep only the text before " avg rating".
            avg_rating = rating_tag.get_text().split(" avg rating")[0].strip()
        cover_url = cover_tag["src"] if cover_tag and cover_tag.has_attr("src") else None

        books.append({
            "title": title_tag.get_text(strip=True),
            "author": author_tag.get_text(strip=True),
            "average_rating": avg_rating,
            "image_url": cover_url,
            "genre": "Terror/Horror",
        })
    return books


def get_all_books_from_list(list_url):
    """Scrape every page of a list, stopping at the first page without books."""
    books = []
    previous_page = None
    page = 1
    while True:
        url = f"{list_url}?page={page}"
        print(f"[INFO] Scraping {url}")
        page_books = scrape_list_page(url)
        # NOTE(review): the second condition guards against a site that keeps
        # serving the last page for out-of-range page numbers (unverified for
        # Goodreads); without it such a site would make this loop endless.
        if not page_books or page_books == previous_page:
            break
        books.extend(page_books)
        previous_page = page_books
        page += 1
    return books


def process_book(book, processed_set):
    """Create the note for one book.

    Args:
        book: Dict as produced by scrape_list_page().
        processed_set: Set of "title_author" ids that were already imported.

    Returns:
        The book's id, or None when the id is already in `processed_set`.
    """
    title = book.get("title")
    autor = book.get("author")
    uid = f"{title}_{autor}"
    if uid in processed_set:
        print("[SKIP]", title)
        return None

    # Plain scalar values: the dashboard reads these fields as strings.
    yaml_obj = {
        "title": title,
        "portugueseTitle": title,
        "englishTitle": title,
        "coverUrl": book.get("image_url"),
        "onlineRating": book.get("average_rating") or "Desconhecido",
        "type": "Livros",
        "subType": "Terror",
        "status": "Desconhecido",
        "rating": "Desconhecido",
        "autor": autor,
    }

    fname = safe_filename(f"{title}.md")
    md_body = (
        f"# {title}\n\n"
        f"**Autor:** {autor}\n\n"
        f"**Gênero:** Terror\n\n"
        f"**Rating online:** {book.get('average_rating')}\n\n"
    )
    if book.get("image_url"):
        md_body += f"![{title}]({book.get('image_url')})\n"

    if write_md(fname, yaml_obj, md_body):
        print("[OK]", title)
    else:
        print("[EXISTS]", title)
    return uid


def main():
    """Scrape the configured list and import every book not yet in the checkpoint."""
    list_url = "https://www.goodreads.com/list/show/2455.The_Most_Disturbing_Books_Ever_Written"
    books = get_all_books_from_list(list_url)
    print(f"[INFO] Found {len(books)} books.")

    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            processed = set(json.load(f))
    else:
        processed = set()

    try:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
            futures = [pool.submit(process_book, b, processed) for b in books]
            for future in as_completed(futures):
                uid = future.result()
                if uid:
                    processed.add(uid)
    finally:
        # Save progress even when a worker raised, so a rerun resumes where
        # this run stopped.
        with open(CHECKPOINT_FILE, "w", encoding="utf-8") as f:
            json.dump(sorted(processed), f, ensure_ascii=False, indent=2)

    print("[INFO] Import finished.")


if __name__ == "__main__":
    main()