# Source code for bookmarks

# -*- coding: utf-8 -*-
#
#       Copyright 2011 Liftoff Software Corporation
#

__doc__ = """\
bookmarks.py - A plugin for Gate One that adds fancy bookmarking capabilities.

Hooks
-----
This Python plugin file implements the following hooks::

    hooks = {
        'Web': [
            (r"/bookmarks/fetchicon", FaviconHandler),
            (r"/bookmarks/export", ExportHandler),
            (r"/bookmarks/import", ImportHandler),
        ],
        'WebSocket': {
            'bookmarks_sync': save_bookmarks,
            'bookmarks_get': get_bookmarks,
            'bookmarks_deleted': delete_bookmarks,
            'bookmarks_rename_tags': rename_tags,
        }
    }

Docstrings
----------
"""

# Meta
__version__ = '1.0'
__license__ = "GNU AGPLv3 or Proprietary (see LICENSE.txt)"
__version_info__ = (1, 0)
__author__ = 'Dan McDougall <daniel.mcdougall@liftoffsoftware.com>'

# Python stdlib
import os, sys, logging, time
from functools import partial

# Our stuff
from gateone import BaseHandler
from utils import noop, json_encode

# Tornado stuff
import tornado.web
from tornado.escape import json_decode

# 3rd party stuff
# The following two lines let us import modules in the "dependencies" dir
plugin_path = os.path.split(__file__)[0]
sys.path.append(os.path.join(plugin_path, "dependencies"))

# Globals
# Lookup table that maps both real booleans and their common string spellings
# ('True'/'true'/'False'/'false') onto Python booleans.
# NOTE(review): nothing in this part of the file references it -- confirm
# whether another module still uses it before removing.
boolean_fix = {
    True: True,
    False: False,
    'True': True,
    'False': False,
    'true': True,
    'false': False
}

# Helper functions
def unescape(s):
    """
    Unescape HTML character references in *s* and return the resulting string;
    c.f. http://wiki.python.org/moin/EscapingHtml

    Named references (e.g. ``&amp;``) as well as decimal (``&#39;``) and
    hexadecimal (``&#x27;``) numeric references are converted.  References that
    can't be resolved (unknown names, out-of-range code points) are left
    exactly as they were.
    """
    import re
    try:
        from htmlentitydefs import name2codepoint
    except ImportError: # Python 3.X
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError: # Python 3.X
        to_char = chr

    def replace(match):
        """Returns the character for one matched reference (or the match)."""
        ref = match.group(1)
        try:
            if ref[:2] in ('#x', '#X'):
                codepoint = int(ref[2:], 16)
            elif ref[0] == '#':
                codepoint = int(ref[1:])
            else:
                # Looked up read-only; the stdlib table is shared by everyone
                # in the process so we must not add entries to it.
                codepoint = name2codepoint[ref]
            return to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            return match.group(0) # Not something we can convert

    return re.sub(
        r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);', replace, s)

def parse_bookmarks_html(html):
    """
    Reads the Netscape-style bookmarks.html in string, *html* and returns a
    list of bookmark dicts.

    Each returned dict has the keys ``url``, ``name``, ``tags``, ``notes``,
    ``visits``, ``updated``, ``created``, ``updateSequenceNum`` and ``images``.
    Folder names (``<H3>``) become tags unless the link carries a
    Delicious-style ``TAGS`` attribute, and the text of a ``<DD>`` becomes the
    notes of the bookmark that precedes it.  Timestamps are normalized to
    JavaScript-style 13-digit (millisecond) epochs.
    """
    # If this looks impossibly complicated it's because parsing HTML streams is
    # dark voodoo.  The parser is a small state machine driven by the flags
    # below (are we inside an <H3>, an <A> or a <DD> right now?).
    import html5lib
    try:
        text_types = (str, unicode)
    except NameError: # Python 3.X
        text_types = (str,)
    out_list = []
    p = html5lib.HTMLParser(tree=html5lib.treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = html5lib.treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    tags = []
    h3on = False
    aon = False
    ddon = False
    add_date = None
    url = None
    icon = None
    name = ""
    for token in stream:
        token_type = token['type']
        token_name = token.get('name')
        # Not every token carries data (e.g. end tags in newer html5lib)
        data = token.get('data')
        if token_name == 'dl':
            # Leaving a folder drops the innermost folder tag
            if token_type == 'EndTag' and tags:
                tags.pop()
        elif token_name == 'dd':
            if token_type == 'StartTag':
                ddon = True
            elif token_type == 'EndTag':
                ddon = False
        elif token_name == 'h3':
            if token_type == 'StartTag':
                h3on = True
            elif token_type == 'EndTag':
                h3on = False
        elif token_name == 'a':
            if token_type == 'StartTag':
                aon = True
            elif token_type == 'EndTag':
                aon = False
                # JavaScript-style 13-digit epoch:
                now = int(round(time.time() * 1000))
                try:
                    add_date = int(add_date) if add_date else now
                except ValueError: # Garbage in ADD_DATE; treat as missing
                    add_date = now
                if add_date > 9999999999999: # Delicious goes out to 16
                    add_date = add_date // 1000
                if add_date < 10000000000: # Chrome only goes to 10 digits
                    add_date = add_date * 1000
                out_list.append({
                    'url': url,
                    'name': name.strip(),
                    'tags': [a for a in tags if a], # Remove empty tags
                    'notes': "", # Filled in later if a <DD> follows
                    'visits': 0,
                    'updated': add_date,
                    'created': add_date,
                    'updateSequenceNum': 0,
                    'images': {'favicon': icon}
                })
                # Reset everything (just in case)
                add_date = None
                url = None
                icon = None
                name = ""
        if h3on and data and isinstance(data, text_types):
            # Text inside an <H3> is a folder name which we treat as a tag
            tags.append(data)
        if ddon and data and token_type == 'Characters' and out_list:
            # Notes get attached to the bookmark we just created.  A <DD> that
            # shows up before any bookmark has nothing to attach to.
            out_list[-1]['notes'] = unescape(data.strip())
        if aon:
            if token_type == 'StartTag':
                # html5lib changed from using lists of (name, value) to dicts
                # keyed by (namespace, name) at some point after 0.90.  Boil
                # both forms down to (name, value) pairs.
                if isinstance(data, dict):
                    attrs = data.items()
                elif isinstance(data, list):
                    attrs = data
                else:
                    attrs = ()
                for key, value in attrs:
                    if isinstance(key, tuple):
                        key = key[-1] # Drop the namespace
                    if key == 'add_date':
                        add_date = value
                    elif key == 'href':
                        url = value
                    elif key == 'icon':
                        icon = value
                    elif key == 'tags':
                        tags = value.split(',') # Delicious-style
            elif token_type == 'Characters':
                name += unescape(data)
    return out_list

def get_json_tags(json_dict, url):
    """
    Looks inside *json_dict* (the root of a Firefox-style bookmarks.json) for
    tags associated with the given *url*.  Returns the tags found as a list.

    Tags live in the top-level child titled 'Tags': every container in there
    is one tag and the places inside that container are the URLs carrying it.
    If *json_dict* has no 'root' or no 'children' key an empty list is
    returned.  Containers without any children (empty tags) are skipped.
    """
    tags = []
    if 'root' not in json_dict or 'children' not in json_dict:
        return tags
    for item in json_dict['children']:
        if item.get('title') != 'Tags':
            continue
        for container in item.get('children', ()):
            if container.get('type') != 'text/x-moz-place-container':
                continue
            for place in container.get('children', ()):
                if place.get('type') != 'text/x-moz-place':
                    continue
                if place.get('uri') == url:
                    tags.append(container['title'])
    return tags

def get_ns_json_bookmarks(json_dict, bookmarks):
    """
    Recursively walks *json_dict* and records every bookmark found within.

    *bookmarks* must be a two-item list: ``bookmarks[0]`` is a dict (keyed by
    URL) that gets updated in-place with each bookmark as it is found and
    ``bookmarks[1]`` is the root of the whole JSON tree (needed to look up
    tags).  URLs that are already present in ``bookmarks[0]`` are left alone
    and 'place:' and 'file:/' URLs are skipped.  Nothing is returned and
    *json_dict* itself is not modified.

    .. note:: Only works with Netscape-style bookmarks.json files.
    """
    def to_epoch_ms(stamp):
        """Normalizes *stamp* to a 13-digit (millisecond) epoch."""
        if stamp > 9999999999999: # Microseconds (Firefox, Delicious)
            return int(stamp // 1000)
        if stamp < 10000000000: # Seconds (Chrome)
            return int(stamp * 1000)
        return stamp

    found = bookmarks[0]
    root = bookmarks[1]
    for child in json_dict.get('children', ()):
        kind = child.get('type')
        if kind == 'text/x-moz-place-container':
            get_ns_json_bookmarks(child, bookmarks)
            continue
        if kind != 'text/x-moz-place':
            continue
        uri = child['uri']
        if uri in found:
            continue
        # Browser won't let you load file: URIs from HTTP pages
        if uri[0:6] in ('place:', 'file:/'):
            continue
        # Entries lacking a timestamp are treated as having been made just now
        now = int(round(time.time() * 1000))
        found[uri] = {
            'url': uri,
            'name': (child.get('title') or "").strip(),
            'tags': get_json_tags(root, uri) or ['Untagged'],
            'notes': child.get("annos", ""),
            'visits': 0,
            'updated': to_epoch_ms(child.get('lastModified') or now),
            'created': to_epoch_ms(child.get('dateAdded') or now),
            'updateSequenceNum': 0,
            'images': {} # No icons in JSON :(
        }

def parse_bookmarks_json(json_str):
    """
    Given *json_str* (a Netscape/Firefox-style bookmarks.json), returns a list
    of bookmark dicts representing the data contained therein.  Each URL
    appears at most once in the result.
    """
    # TODO: Get this recognizing and parsing our own JSON format.
    import json # Not imported at the top of this file
    json_obj = json.loads(json_str)
    # The walker fills in the first item (URL -> bookmark) and uses the second
    # one (the whole tree) to look up tags.
    bookmarks = [{}, json_obj]
    get_ns_json_bookmarks(json_obj, bookmarks)
    return list(bookmarks[0].values())

# Data Structures
class BookmarksDB(object):
    """
    Used to read and write bookmarks to a file on disk.  Can also synchronize
    a given list of bookmarks with what's on disk.  Uses a given bookmark's
    ``updateSequenceNum`` to track what wins the "who is newer?" comparison.
    """
    # URL of the special 'bookmark' whose notes hold the list of bookmarks that
    # have been deleted.
    DELETED_URL = "web+deleted:bookmarks/"

    def __init__(self, user_dir, user):
        """
        Sets up our bookmarks database object and reads everything in.  The
        bookmarks are kept in '<user_dir>/<user>/bookmarks.json'.
        """
        self.bookmarks = [] # For temp storage of all bookmarks
        self.user_dir = user_dir
        self.user = user
        users_dir = os.path.join(user_dir, user) # "User's dir"
        self.bookmarks_path = os.path.join(users_dir, "bookmarks.json")
        # Read existing bookmarks into self.bookmarks
        self.open_bookmarks()

    def open_bookmarks(self):
        """
        Opens the bookmarks stored in self.bookmarks_path.  If not present, an
        empty file will be created.
        """
        if not os.path.exists(self.bookmarks_path):
            with open(self.bookmarks_path, 'w') as f:
                f.write('[]') # That's an empty JSON list
            return # Default of empty list will do
        with open(self.bookmarks_path) as f:
            self.bookmarks = json_decode(f.read())

    def save_bookmarks(self):
        """
        Saves self.bookmarks to self.bookmarks_path as a JSON-encoded list.
        """
        with open(self.bookmarks_path, 'w') as f:
            f.write(json_encode(self.bookmarks))

    def sync_bookmarks(self, bookmarks):
        """
        Given *bookmarks*, synchronize with self.bookmarks doing conflict
        resolution and whatnot.

        Bookmarks that are new or newer than what's stored are saved (and
        assigned a fresh ``updateSequenceNum``).  Returns a list of the stored
        bookmarks that are newer than the ones that were given so they can be
        sent back to the client.
        """
        highest_USN = self.get_highest_USN()
        changed = False # For if there's changes that need to be written
        updated_bookmarks = [] # For bookmarks that are newer on the server
        for bm in bookmarks:
            if bm['url'] == self.DELETED_URL:
                # NOTE(review): This used to compare against an undefined name
                # ('bookmark') which raised a NameError as soon as the list had
                # an entry.  The bookmark at hand is the only candidate in
                # scope so that is what we compare with: The list of deleted
                # bookmarks must not contain an entry for itself.  Confirm this
                # matches the original intent.
                bm['notes'][:] = [
                    deleted_bm for deleted_bm in bm['notes']
                    if deleted_bm['url'] != bm['url']
                ]
            found_existing = False
            for i, db_bookmark in enumerate(self.bookmarks):
                if bm['url'] != db_bookmark['url']:
                    continue
                # Bookmark already exists, check which is newer
                found_existing = True
                if bm['updateSequenceNum'] > db_bookmark['updateSequenceNum']:
                    # The given bookmark is newer than what's in the DB
                    highest_USN += 1 # Increment the USN
                    bm['updateSequenceNum'] = highest_USN
                    self.bookmarks[i] = bm # Replace it
                    changed = True
                elif bm['updateSequenceNum'] < db_bookmark['updateSequenceNum']:
                    # DB has a newer bookmark.  Add it to the list to send
                    # to the client.
                    updated_bookmarks.append(db_bookmark)
                # Otherwise the USNs are equal and there's nothing to do
            if not found_existing:
                # This is a new bookmark.  Add it
                highest_USN += 1 # Increment the USN
                bm['updateSequenceNum'] = highest_USN
                self.bookmarks.append(bm)
                changed = True # So it will be saved
        if changed:
            # Write the changes to disk
            self.save_bookmarks()
        # Let the client know what's newer on the server
        return updated_bookmarks

    def delete_bookmark(self, bookmark):
        """
        Deletes the given *bookmark* (matched by URL) and records it in the
        special list of deleted bookmarks.
        """
        highest_USN = self.get_highest_USN()
        for i, db_bookmark in enumerate(self.bookmarks):
            if bookmark['url'] != db_bookmark['url']:
                continue
            del self.bookmarks[i] # Remove it
            # The deleted bookmarks 'bookmark' is just a list of URLs that
            # have been deleted along with the time it happened.  This lets
            # us keep multiple browsers in sync with what's been deleted
            # so we don't inadvertently end up re-adding bookmarks that were
            # deleted by another client.
            special_deleted_bm = None
            for bm in self.bookmarks:
                if bm['url'] == self.DELETED_URL:
                    special_deleted_bm = bm
            if special_deleted_bm is None:
                # Make our first entry.  It needs a USN above everything else
                # or get_bookmarks() would never hand it out to clients (the
                # USN used to end up in 'visits' with the USN itself being 0).
                now = int(round(time.time() * 1000))
                self.bookmarks.append({
                    'url': self.DELETED_URL,
                    'name': "Deleted Bookmarks",
                    'tags': [],
                    'notes': [bookmark],
                    'visits': 0,
                    'updated': now,
                    'created': now,
                    'updateSequenceNum': highest_USN + 1,
                    'images': {}
                })
            else:
                # Check for pre-existing
                updated = False
                for j, deleted_bm in enumerate(special_deleted_bm['notes']):
                    if deleted_bm['url'] == bookmark['url']:
                        # Update it in place
                        special_deleted_bm['notes'][j] = bookmark
                        updated = True
                if not updated:
                    special_deleted_bm['notes'].append(bookmark)
                special_deleted_bm['updateSequenceNum'] = highest_USN + 1
            break
        # Save the change to disk
        self.save_bookmarks()

    def get_bookmarks(self, updateSequenceNum=0):
        """
        Returns a list of bookmarks newer than *updateSequenceNum*.
        If *updateSequenceNum* is 0 or undefined, all bookmarks (with a USN
        above 0) will be returned.
        """
        return [
            bm for bm in self.bookmarks
            if bm['updateSequenceNum'] > updateSequenceNum
        ]

    def get_highest_USN(self):
        """
        Returns the highest updateSequenceNum in self.bookmarks (0 if there
        are no bookmarks).
        """
        return max([0] + [bm['updateSequenceNum'] for bm in self.bookmarks])

    def rename_tag(self, old_tag, new_tag):
        """
        Goes through all bookmarks and renames all tags named *old_tag* to be
        *new_tag*.  A bookmark that already had *new_tag* won't end up with it
        twice.
        """
        highest_USN = self.get_highest_USN()
        for bm in self.bookmarks:
            if old_tag not in bm['tags']:
                continue
            renamed = []
            for tag in bm['tags']:
                if tag == old_tag:
                    tag = new_tag
                if tag == new_tag and new_tag in renamed:
                    continue # Already there
                renamed.append(tag)
            bm['tags'][:] = renamed
            # Made a change so we need to increment the USN to ensure sync
            highest_USN += 1
            bm['updateSequenceNum'] = highest_USN
            bm['updated'] = int(round(time.time() * 1000))
        # Save the change to disk
        self.save_bookmarks()

# Handlers
class FaviconHandler(BaseHandler):
    """
    Retrieves the favicon of the page at the given URL (the 'url' argument)
    and writes it to the client as a base64-encoded data: URI.  The page is
    searched for a <link rel="icon"> (or "shortcut icon") and if there is none
    we fall back to grabbing /favicon.ico.

    .. note:: Works with GET and POST requests but POST is preferred since it keeps the URL from winding up in the server logs.
    """
    # Valid favicon mime types
    favicon_mimetypes = [
        'image/vnd.microsoft.icon',
        'image/x-icon',
        'image/png',
        'image/svg+xml',
        'image/gif',
        'image/jpeg'
    ]
    @tornado.web.asynchronous
    def get(self):
        self.process()

    @tornado.web.asynchronous
    def post(self):
        self.process()

    def process(self):
        """
        Starts fetching the page at the 'url' argument.  `on_response` takes
        over once the page has arrived.
        """
        from tornado.httpclient import AsyncHTTPClient
        # NOTE(review): The server fetches whatever URL the client hands us
        # (including internal addresses).  Consider restricting what may be
        # fetched if this handler is reachable by untrusted users.
        url = self.get_argument("url")
        http = AsyncHTTPClient()
        callback = partial(self.on_response, url)
        http.fetch(url, callback, connect_timeout=5.0, request_timeout=5.0)

    def get_favicon_url(self, html):
        """
        Parses *html* looking for a favicon URL.  Returns a tuple of::

            (<url>, <mimetype>)

        If no favicon can be found, returns::

            (None, None)

        The URL is returned just like it is written in the page (i.e. it may
        be relative).  Icons with a type that isn't in `favicon_mimetypes`
        are skipped; icons without a type are assumed to be "image/x-icon".
        """
        import html5lib
        p = html5lib.HTMLParser(
            tree=html5lib.treebuilders.getTreeBuilder("dom"))
        dom_tree = p.parse(html)
        walker = html5lib.treewalkers.getTreeWalker("dom")
        for token in walker(dom_tree):
            if token.get('name') != 'link':
                continue
            # html5lib changed from using lists of (name, value) to dicts
            # keyed by (namespace, name) at some point after 0.90
            attrs = token.get('data') or ()
            if isinstance(attrs, dict):
                attrs = attrs.items()
            # Start fresh so attributes of an earlier <link> can't leak in
            fetch_url = None
            mimetype = None
            icon = False
            for key, value in attrs:
                if isinstance(key, tuple):
                    key = key[-1] # Drop the namespace
                if key == 'rel':
                    # Matches both rel="icon" and rel="shortcut icon"
                    icon = 'icon' in value.lower().split()
                elif key == 'href':
                    fetch_url = value
                elif key == 'type':
                    mimetype = value
            if fetch_url and icon:
                if not mimetype:
                    mimetype = "image/x-icon"
                if mimetype in self.favicon_mimetypes:
                    return (fetch_url, mimetype)
        return (None, None)

    def on_response(self, url, response):
        """
        Called with the page at *url*.  Figures out where its icon lives and
        starts fetching that; `icon_fetch` takes over once it has arrived.
        """
        try:
            from urlparse import urljoin
        except ImportError: # Python 3.X
            from urllib.parse import urljoin
        from socket import gaierror
        from tornado.httpclient import AsyncHTTPClient
        if response.error:
            self.write('Unable to fetch icon.')
            self.finish()
            return
        body = response.body or b''
        try:
            content = body.decode('utf-8')
        except UnicodeDecodeError:
            content = body
        (fetch_url, mimetype) = self.get_favicon_url(content)
        if not mimetype:
            mimetype = "image/x-icon" # Default
        # urljoin() takes care of absolute, relative and protocol-relative
        # (//host/icon.png) URLs alike
        fetch_url = urljoin(url, fetch_url or '/favicon.ico')
        if not fetch_url.startswith(('http://', 'https://')):
            raise tornado.web.HTTPError(404)
        http = AsyncHTTPClient()
        callback = partial(self.icon_fetch, url, mimetype)
        try:
            http.fetch(
                fetch_url,
                callback,
                connect_timeout=5.0,
                request_timeout=5.0
            )
        except gaierror: # No address associated with hostname
            self.write('Unable to fetch icon.')
            self.finish()
            return

    def _data_uri(self, mimetype, body):
        """Returns *body* as a base64-encoded data: URI of type *mimetype*."""
        import base64
        encoded = base64.b64encode(body or b'').decode('ascii')
        return "data:%s;base64,%s" % (mimetype, encoded)

    def icon_multifetch(self, urls, response):
        """
        Called with the *response* to an icon fetch.  If it failed, the next
        URL is taken off (the end of) *urls* and fetched with this method as
        the callback; a 404 is raised once there's nothing left to try.
        If the icon was fetched, writes it to the client and finishes the
        request.
        """
        from socket import gaierror
        from tornado.httpclient import AsyncHTTPClient
        if response.error:
            if not urls:
                raise tornado.web.HTTPError(404)
            url = urls.pop()
            http = AsyncHTTPClient()
            callback = partial(self.icon_multifetch, urls)
            try:
                http.fetch(url, callback)
            except gaierror:
                raise tornado.web.HTTPError(404)
            return
        if 'Content-Type' in response.headers:
            mimetype = response.headers['Content-Type']
        else:
            mimetype = "image/vnd.microsoft.icon"
        self.set_header("Content-Type", mimetype)
        self.write(self._data_uri(mimetype, response.body))
        self.finish()

    def icon_fetch(self, url, mimetype, response):
        """Returns the fetched icon to the client (as a data: URI)."""
        if response.error:
            self.write('Unable to fetch icon.')
            self.finish()
            return
        data_uri = self._data_uri(mimetype, response.body)
        self.set_header("Content-Type", mimetype)
        self.write(data_uri)
        self.finish()

class ImportHandler(tornado.web.RequestHandler):
    """
    Takes a bookmarks.html (or a Netscape-style bookmarks.json) in a POST and
    returns a list of bookmarks in JSON format
    """
    @tornado.web.asynchronous
    def post(self):
        body = self.request.body
        # Ignore a UTF-8 byte order mark and leading whitespace when figuring
        # out what kind of file this is
        stripped = body
        if stripped[:3] == b'\xef\xbb\xbf':
            stripped = stripped[3:]
        # Compare against both bytes and text since the type of the body
        # differs between Python versions
        if stripped.lstrip()[:1] in (b'{', u'{'): # This is a JSON file
            bookmarks = parse_bookmarks_json(stripped)
        else:
            bookmarks = parse_bookmarks_html(body)
        self.write(tornado.escape.json_encode(bookmarks))
        self.finish()
        # NOTE: The client will take care of storing these at the next sync

class ExportHandler(tornado.web.RequestHandler):
    """
    Renders the JSON-encoded list of bookmarks given in the 'bookmarks'
    argument of a POST as a Netscape-style HTML file which is sent back as a
    download named "bookmarks.html".
    """
    @tornado.web.asynchronous
    def post(self):
        decoded = tornado.escape.json_decode(self.get_argument("bookmarks"))
        self.set_header("Content-Type", "text/html")
        self.set_header(
            "Content-Disposition", 'attachment; filename="bookmarks.html"')
        # The template lives in this plugin's "templates" directory
        template = os.path.join(plugin_path, "templates", "bookmarks.html")
        self.render(template, bookmarks=decoded)

# WebSocket commands (not the same as handlers)
def save_bookmarks(bookmarks, tws):
    """
    Synchronizes *bookmarks* (sent by the client) with the bookmarks stored
    for the current user of *tws* and reports back via a
    'bookmarks_save_result' message.  The message carries the bookmarks that
    are newer on the server ('updates'), how many bookmarks were given
    ('count'), any 'errors' and (if all went well) the highest
    'updateSequenceNum'.
    """
    out_dict = {
        'updates': [],
        'count': 0,
        'errors': []
    }
    try:
        upn = tws.get_current_user()['upn']
        db = BookmarksDB(tws.settings['user_dir'], upn)
        newer_on_server = db.sync_bookmarks(bookmarks)
        total = len(bookmarks)
        out_dict['updates'] = newer_on_server
        out_dict['count'] = total
        out_dict['updateSequenceNum'] = db.get_highest_USN()
    except Exception as e:
        import traceback
        logging.error("Got exception synchronizing bookmarks: %s" % e)
        traceback.print_exc(file=sys.stdout)
        out_dict['errors'].append(str(e))
    out_dict['result'] = (
        "Upload completed but errors were encountered."
        if out_dict['errors'] else "Upload successful")
    tws.write_message(json_encode({'bookmarks_save_result': out_dict}))

def get_bookmarks(updateSequenceNum, tws):
    """
    Sends the client a 'bookmarks_updated' message holding the bookmarks of
    the current user that are newer than *updateSequenceNum*.

    If *updateSequenceNum* resolves to False, a full download is forced (i.e.
    it is treated as 0).
    """
    upn = tws.get_current_user()['upn']
    db = BookmarksDB(tws.settings['user_dir'], upn)
    since = int(updateSequenceNum) if updateSequenceNum else 0
    reply = {'bookmarks_updated': db.get_bookmarks(since)}
    tws.write_message(json_encode(reply))

def delete_bookmarks(deleted_bookmarks, tws):
    """
    Deletes every bookmark in the *deleted_bookmarks* list from the bookmarks
    of the current user of *tws* and reports back via a
    'bookmarks_delete_result' message.  Processing stops at the first bookmark
    that can't be deleted.
    """
    upn = tws.get_current_user()['upn']
    db = BookmarksDB(tws.settings['user_dir'], upn)
    out_dict = {
        'result': "",
        'count': 0,
        'errors': [],
    }
    try:
        for doomed in deleted_bookmarks:
            # Counted up front so a failed attempt is included in the count
            out_dict['count'] += 1
            db.delete_bookmark(doomed)
    except Exception as e: # TODO: Make this more specific
        logging.error("delete_bookmarks error: %s" % e)
        import traceback
        traceback.print_exc(file=sys.stdout)
        out_dict['result'] = "Errors"
        out_dict['errors'].append(str(e))
    else:
        out_dict['result'] = "Success"
    tws.write_message(json_encode({'bookmarks_delete_result': out_dict}))

def rename_tags(renamed_tags, tws):
    """
    Handles renaming tags.  *renamed_tags* must be a list of strings in the
    form of, 'old name,new name'.

    The outcome is reported back to the client via a 'bookmarks_renamed_tags'
    message: 'count' is the number of renames that were carried out, 'errors'
    holds a message for each pair that could not be processed and 'result' is
    "Success" or "Errors" (just like with `delete_bookmarks`).
    """
    user = tws.get_current_user()['upn']
    bookmarks_db = BookmarksDB(tws.settings['user_dir'], user)
    out_dict = {
        'result': "",
        'count': 0,
        'errors': [],
        'updates': []
    }
    for pair in renamed_tags:
        # A bad pair must not keep the remaining ones from being processed
        # (or the client from getting an answer)
        try:
            old_name, new_name = pair.split(',')
            bookmarks_db.rename_tag(old_name, new_name)
            out_dict['count'] += 1
        except Exception as e: # TODO: Make this more specific
            logging.error("rename_tags error: %s" % e)
            out_dict['errors'].append(str(e))
    out_dict['result'] = "Errors" if out_dict['errors'] else "Success"
    message = {'bookmarks_renamed_tags': out_dict}
    tws.write_message(json_encode(message))

# What this plugin provides: 'Web' is a list of (URL pattern, handler class)
# pairs and 'WebSocket' maps command names to the functions defined above
# that handle them.
# NOTE(review): presumably read by Gate One's plugin loader -- confirm there
# before renaming any of these keys.
hooks = {
    'Web': [
        (r"/bookmarks/fetchicon", FaviconHandler),
        (r"/bookmarks/export", ExportHandler),
        (r"/bookmarks/import", ImportHandler),
    ],
    'WebSocket': {
        'bookmarks_sync': save_bookmarks,
        'bookmarks_get': get_bookmarks,
        'bookmarks_deleted': delete_bookmarks,
        'bookmarks_rename_tags': rename_tags,
    }
}
# Navigation
#
# Source code for bookmarks
#
# Quick search
#
# Navigation