added https to couchdb + cleanup + added couchdb to docs

This commit is contained in:
Keith Tunstead 2019-11-08 10:57:10 +01:00
parent afb0afa970
commit e69f5de584
8 changed files with 44 additions and 30 deletions

5
.env
View File

@ -1,5 +0,0 @@
COUCHDB_USER=user
COUCHDB_PASSWORD=password
COUCHDB_NAME=couchdb
COUCHDB_PORT_5984_TCP_ADDR=couchdb
COUCHDB_PORT_5984_TCP_PORT=5984

View File

@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**
- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc...

View File

@ -1,6 +1,7 @@
version: "3.7"
# docker build ./ -t pyspider:latest
# replace /path/to/dir/ to point to config_example.json
services:
rabbitmq:
@ -16,8 +17,12 @@ services:
- pyspider
ports:
- "5984:5984"
env_file: .env
environment:
- COUCHDB_NAME=couchdb
- COUCHDB_USER=user
- COUCHDB_PASSWORD=password
- COUCHDB_HTTPS=true
# OR we can replace couchdb with mysql
#mysql:
# image: mysql:latest
# container_name: mysql
@ -27,15 +32,13 @@ services:
# - MYSQL_ALLOW_EMPTY_PASSWORD=yes
# networks:
# - pyspider
# env_file: .env
phantomjs:
image: pyspider:latest
container_name: phantomjs
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json phantomjs
depends_on:
- couchdb
@ -46,9 +49,8 @@ services:
container_name: result
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json result_worker
depends_on:
- couchdb
@ -59,9 +61,8 @@ services:
image: pyspider:latest
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json processor
depends_on:
- couchdb
@ -72,9 +73,8 @@ services:
container_name: fetcher
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json fetcher
depends_on:
- couchdb
@ -85,9 +85,8 @@ services:
container_name: scheduler
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
command: -c config.json scheduler
depends_on:
- couchdb
@ -100,9 +99,8 @@ services:
- "5050:5000"
networks:
- pyspider
env_file: .env
volumes:
- /Users/Keith/Documents/Projects/python/python_projects/pyspider/pyspider/config_example.json:/opt/pyspider/config.json
- /path/to/dir/config_example.json:/opt/pyspider/config.json
environment:
- SCHEDULER_PORT_23333_TCP_ADDR=scheduler
command: -c config.json webui

View File

@ -72,6 +72,8 @@ sqlite:
mongodb:
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
couchdb+type://[username:password@]host[:port]
sqlalchemy:
sqlalchemy+postgresql+type://user:passwd@host:port/database
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database

View File

@ -8,7 +8,7 @@ To deploy pyspider in product environment, running component in each process and
Installation
------------
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
To deploy pyspider components in each single processes, you need at least one database service. pyspider now supports [MySQL](http://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](http://www.mongodb.org/) and [PostgreSQL](http://www.postgresql.org/). You can choose one of them.
And you need a message queue service to connect the components together. You can use [RabbitMQ](http://www.rabbitmq.com/) or [Redis](http://redis.io/) as message queue.
@ -63,6 +63,8 @@ sqlite:
mongodb:
mongodb+type://[username:password@]host1[:port1][,host2[:port2],...[,hostN[:portN]]][/[database][?options]]
more: http://docs.mongodb.org/manual/reference/connection-string/
couchdb:
couchdb+type://[username:password@]host[:port]
sqlalchemy:
sqlalchemy+postgresql+type://user:passwd@host:port/database
sqlalchemy+mysql+mysqlconnector+type://user:passwd@host:port/database

View File

@ -5,7 +5,7 @@ A Powerful Spider(Web Crawler) System in Python. **[TRY IT NOW!][Demo]**
- Write script in Python
- Powerful WebUI with script editor, task monitor, project manager and result viewer
- [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend
- [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue
- Task priority, retry, periodical, recrawl by age, etc...
- Distributed architecture, Crawl Javascript pages, Python 2&3, etc...

View File

@ -34,7 +34,7 @@ def connect_database(url):
elasticsearch:
elasticsearch+type://host:port/?index=pyspider
couchdb:
couchdb+type://host[:port]
couchdb+type://[username:password@]host[:port]
local:
local+projectdb://filepath,filepath
@ -207,14 +207,29 @@ def _connect_elasticsearch(parsed, dbtype):
def _connect_couchdb(parsed, dbtype, url):
# TODO: Add https + auth as parameters
url = "http://" + parsed.netloc + "/"
if os.environ.get('COUCHDB_HTTPS'):
url = "https://" + parsed.netloc + "/"
else:
url = "http://" + parsed.netloc + "/"
params = {}
params['username'] = os.environ.get('COUCHDB_USER') or 'user'
params['password'] = os.environ.get('COUCHDB_PASSWORD') or 'password'
username = None
password = None
if '@' in parsed.netloc:
# netloc looks like: 'user:pass@couchdb:999'
url = parsed.netloc[parsed.netloc.find("@")+1:]
# extract the username and password
username = parsed.netloc[:parsed.netloc.find(":")]
password = parsed.netloc[parsed.netloc.find(":")+1:parsed.netloc.find("@")]
# default to env, then url, then hard coded
params['username'] = os.environ.get('COUCHDB_USER') or username or 'user'
params['password'] = os.environ.get('COUCHDB_PASSWORD') or password or 'password'
# create required CouchDB databases if not already present
requests.put(url+"_users")
requests.put(url+"_replicator")
# create the admin user
# NOTE: Over docker, this user is already created when COUCHDB_USER and COUCHDB_PASSWORD are set
requests.put(url+'_node/_local/_config/admins/'+ params['username'],

View File

@ -114,8 +114,10 @@ def cli(ctx, **kwargs):
elif os.environ.get('COUCHDB_NAME'):
kwargs[db] = utils.Get(lambda db=db: connect_database(
'couchdb+%s://%s:%s/%s' % (
db, os.environ['COUCHDB_PORT_5984_TCP_ADDR'],
os.environ['COUCHDB_PORT_5984_TCP_PORT'], db)))
db,
os.environ['COUCHDB_PORT_5984_TCP_ADDR'] or 'couchdb',
os.environ['COUCHDB_PORT_5984_TCP_PORT'] or '5984',
db)))
elif ctx.invoked_subcommand == 'bench':
if kwargs['data_path'] == './data':
kwargs['data_path'] += '/bench'